In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn import preprocessing

In [2]:
# read traning data
df = pd.read_csv("training_data.csv", low_memory=False)
start_index = list(df.columns).index('Claim_Count')

In [3]:
target = df['Loss_Amount']
df = df.drop(df.columns[start_index:], axis=1)
df

Unnamed: 0,PolicyNo,Policy_Company,Policy_Installment_Term,Policy_Billing_Code,Policy_Method_Of_Payment,Policy_Reinstatement_Fee_Indicator,Policy_Zip_Code_Garaging_Location,Vehicle_Territory,Vehicle_Make_Year,Vehicle_Make_Description,...,EEA_Policy_Zip_Code_3,EEA_Policy_Tenure,EEA_Agency_Type,EEA_Packaged_Policy_Indicator,EEA_Full_Coverage_Indicator,EEA_Prior_Bodily_Injury_Limit,EEA_PolicyYear,SYS_Renewed,SYS_New_Business,Annual_Premium
0,164532941,Standard,6,Direct Billed to Insured,Pre-paid,N,43046,35,2004,BUIK LESABRE LI,...,430,22.7,Standard,N,Y,100-400,2006,Y,N,320.12
1,164533241,Standard,6,Direct Billed to Insured,Pre-paid,N,Unknown,35,1980,CADILLAC 4-DOOR,...,Unknown,47.1,Preferred,N,Y,100-200,2006,Y,N,259.70
2,164534633,Standard,6,Direct Billed to Insured,Pre-paid,N,43555,17,2005,PONT MONTANA SV,...,435,47.2,Non-standard,N,Y,100-400,2006,Y,N,613.74
3,164534839,Standard,6,Direct Billed to Insured,Pre-paid,N,43561,17,2005,MERC GRAND MARQ,...,435,46.7,Non-standard,Y,Y,40-100,2006,Y,N,541.66
4,164534840,Standard,6,Direct Billed to Insured,Pre-paid,N,43561,17,2005,MERC GRAND MARQ,...,435,47.2,Non-standard,Y,Y,40-100,2006,Y,N,541.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424426,381713000,Standard,6,Direct Billed to Insured,Pre-paid,N,42851,35,1999,PONT GR PRIX GT,...,428,0.0,Preferred,N,Y,,2006,Y,Y,162.55
424427,381735600,Standard,6,Direct Billed to Insured,Pre-paid,N,43669,31,2000,NSSN QUEST,...,436,0.0,Hybrid,Y,N,100-200,2006,Y,Y,117.13
424428,382057400,Standard,6,Direct Billed to Insured,Installment,N,42487,35,1997,PONT TRANSSPORT,...,424,0.0,Preferred,N,Y,100-400,2006,N,Y,118.21
424429,382162500,Preferred,6,Direct Billed to Insured,Installment,N,43360,31,1998,PONT SUNFIRE SE,...,433,0.0,Non-standard,N,N,40-100,2006,N,Y,103.93


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import copy

def drop_unknown(df):
    return df.dropna()

def label_encode(df):
    le = preprocessing.LabelEncoder()
    for col in df.columns:
        col = str(col)
        if str(df.loc[:,col].dtype) == 'object':
            le.fit(df.loc[:,col]) 
            df.loc[:,col] = le.transform(df.loc[:,col])
    return df

def clean_data(df, missing_handler=drop_unknown, data_encoder=label_encode):
    df = missing_handler(df)
    return data_encoder(df)

def pca(df, num_components):
    pca = PCA(num_components).fit(df)
    principal_cols = pca.transform(df)
    return pd.DataFrame(data = principal_cols)
    

def preprocess_data(df, **params):
    """
    Preprocesses dataframe, with customizable options.
    
    params:
        dropped_columns[list]: list of columns to be dropped before preprocessing begins.
    
        clean[Boolean]: should the data be cleaned.
        
        missing_handler[function(dataframe) returns dataframe]: how to handle missing data,
                       'dropna' by default.
                                             
        data_encoder[function(dataframe) returns dataframe]: specifies encoder for data, 
                    'label encoding' is the default.
        
        feature_transform[Boolean]: does feature transformation need to be performed.
        
        feature_transformer[function(dataframe) returns dataframe]: specifies the feature transformer,
                        'Standardization' is the default.
                                          
        feature_selection[Boolean]: does feature selection need to be performed.
        
        feature_selector[function(dataframe) returns dataframe]: specifies the feature selector,
                        'PCA' is the default.
                        
        num_components[int]: specifies the number of principal components we want.
                                 Default is 30 components.
        
    NOTE: all boolean params are set to 'False' by default.
          So the call 'preprocessor()' does nothing, because no arguments are passed.
    """
    drop_cols = params.get('dropped_columns', [])
    df = df.drop(drop_cols, axis=1)
    
    do_clean = params.get('clean', False)
    if do_clean:
        missing_handler = params.get('missing_handler', drop_unknown)
        data_encoder = params.get('data_encoder', label_encode)
        df = clean_data(df, missing_handler, data_encoder)
    
    do_transform = params.get('feature_transform', False)
    if do_transform:
        transformer = params.get('feature_transformer', StandardScaler().fit_transform)
        scaled_features = transformer(df)
        df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)
        
    do_feature_selection = params.get('feature_selection', False)
    num_components = params.get('num_components', 30)
    if do_feature_selection:
        feature_selector = params.get('feature_selector', pca)
        df = feature_selector(df, num_components)
    
    return df

In [5]:
df_clean = preprocess_data(df, dropped_columns=['PolicyNo'], clean=True, remove_outliers=False,
                           feature_transform=True, feature_selection=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [6]:
df_clean['loss'] = target
premiums = preprocess_data(df, dropped_columns=['PolicyNo'], clean=True, remove_outliers=False,
                           feature_transform=False, feature_selection=False)
df_clean['premium'] = premiums['Annual_Premium']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [7]:
import random

def make_portfolios(df):
    with_claims = []
    no_claims = []
    for i, r in enumerate(df['loss']):
        if r > 0.0:
            with_claims.append(i)
        else:
            no_claims.append(i)
            
    random.shuffle(with_claims)
    random.shuffle(no_claims)
    
    ratio = int(len(no_claims)/len(with_claims))
    per_portfolio = int(1000/ratio)
    
    possible_claims_portfolios = int(len(with_claims)/(per_portfolio))
    possible_noclaims_portfolios = int(len(no_claims)/(per_portfolio*ratio))
    
    num_portfolios = min(possible_claims_portfolios, possible_noclaims_portfolios)
    
    portfolios = []
    for i in range(num_portfolios):
        portfolio = []
        for j in range(per_portfolio):
            for k in range(ratio):
                portfolio.append(df.iloc[no_claims.pop()])
            portfolio.append(df.iloc[with_claims.pop()])
            random.shuffle(portfolio)
        portfolios.append(pd.DataFrame(portfolio, columns=df.columns))    
        
    return portfolios

In [8]:
portfolios = make_portfolios(df_clean)

In [10]:
def mean_summarizer(df_t):
    values = df_t.mean()
    n_df = pd.DataFrame([], columns=df_t.columns)
    return n_df.append(values, ignore_index=True)    

def summarize_portfolios(portfolios, summarizer=mean_summarizer, convert_to_lr=False):
    summarized_portfolios = []
    flag = True
    for p in portfolios:
        portfolio = summarizer(p)
        if convert_to_lr:
            portfolio['lr'] = np.log(portfolio['loss']/portfolio['premium'])
            portfolio = portfolio.drop(['loss', 'premium'], axis=1)
        if flag:
            cols = portfolio.columns
            flag = False
        summarized_portfolios.append(portfolio)
        
    if cols is None:
        return None
    n_df = pd.DataFrame([], columns=cols)
    for p in summarized_portfolios:
        n_df = n_df.append(p, ignore_index=True)
#     print(n_df)
    return n_df

In [11]:
training_set = summarize_portfolios(portfolios, convert_to_lr=True)

In [101]:
import glob


import warnings
warnings.filterwarnings('ignore')


def get_testing_portfolios(folder_name):
    path = 'testing_portfolios' # use your path
    all_files = glob.glob(path + "/*.csv")
    print(len(all_files))
    portfolios = []
    i = 0
    for filename in all_files:
        p = pd.read_csv(filename, index_col=None, header=0)
        p = preprocess_data(p, dropped_columns=['PolicyNo'], clean=True, remove_outliers=False,
                            feature_transform=True, feature_selection=True)
        portfolios.append(p)
    return summarize_portfolios(portfolios)

In [71]:
t_p = get_testing_portfolios('testing_portfolios')

330


In [13]:
losses = training_set['lr']

In [59]:
ds = [[1, 2, 3],[4, 5, 6]]
p_ds = pd.DataFrame(ds)
mean_summarizer(p_ds)

Unnamed: 0,0,1,2
0,2.5,3.5,4.5


In [75]:
t_p.to_csv('testing_portfolios_median.csv')

In [113]:
losses = training_set['loss']

In [105]:
target

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
424426    0.0
424427    0.0
424428    0.0
424429    0.0
424430    0.0
Name: Loss_Amount, Length: 424431, dtype: float64

In [16]:
losses

0     -0.509866
1     -0.767435
2     -0.332697
3     -0.435303
4     -0.568037
         ...   
375   -0.528748
376    0.326807
377   -0.452011
378   -0.636387
379   -0.668743
Name: lr, Length: 380, dtype: float64

In [118]:
training_set = training_set.drop('loss', axis=1)

In [11]:
training_set

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,lr
0,0.013194,0.063571,0.088012,0.009799,0.021530,0.060274,0.074977,0.028069,-0.054494,-0.025556,...,-0.013276,-0.022436,0.011691,-0.006522,0.022634,-0.037710,-0.004630,-0.000122,0.025122,-0.447475
1,-0.001657,-0.016529,-0.001698,-0.023342,0.031382,0.010239,0.024535,-0.043607,0.003146,0.068984,...,-0.021568,-0.001560,-0.065198,-0.006465,-0.025500,-0.003053,-0.007712,-0.033729,0.041375,-0.708914
2,-0.168495,-0.056902,0.037373,-0.039389,0.003843,0.020237,0.032745,0.013362,-0.019529,-0.056134,...,-0.005160,-0.016093,0.023833,0.009690,-0.034927,0.029056,0.011031,-0.032125,0.004241,-0.669580
3,0.065422,-0.112453,0.023047,-0.075998,-0.061645,-0.026318,-0.003452,-0.015500,-0.029477,-0.008439,...,-0.006161,-0.013677,-0.010045,-0.014879,-0.039260,0.060832,0.029565,-0.018894,-0.006463,-0.655944
4,0.079353,-0.078708,0.052352,0.020750,0.040520,0.006239,0.004638,-0.070893,-0.005416,-0.045614,...,-0.008746,-0.017978,-0.023423,-0.005638,-0.003815,0.021324,-0.006199,0.013405,-0.029775,-0.632838
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,0.074270,0.025524,-0.004397,-0.037993,0.007221,0.010578,0.023441,0.044455,-0.026937,-0.074626,...,-0.008247,-0.021883,0.032083,0.048526,-0.063318,-0.025847,0.017867,-0.031623,0.008319,0.606037
376,0.102476,-0.005623,-0.051868,0.098793,-0.005386,-0.023201,-0.009690,0.040823,-0.035976,-0.012860,...,0.044215,0.002330,-0.003159,0.000519,-0.020201,-0.049859,-0.040714,-0.029183,-0.004049,0.249668
377,-0.038408,0.094132,0.052616,0.029553,0.086855,0.043294,-0.040265,0.036500,-0.002072,-0.046976,...,-0.009608,-0.017151,-0.020038,0.037361,-0.041571,0.004421,0.019487,-0.010844,0.046185,-0.269027
378,-0.011267,-0.069999,0.032222,-0.031614,-0.054038,0.000026,0.075849,-0.007304,0.059886,0.057291,...,0.056129,0.006769,0.043736,0.039515,-0.021966,0.004106,0.023296,0.016896,-0.009466,-0.071285


In [19]:
training_set.drop('lr', axis=1).to_csv('training_median.csv', index=False)

In [18]:
losses = training_set['lr']
losses.to_csv('training_loss_ratios.csv', index=False, header=False)

In [124]:
df_clean

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,loss
0,0.822511,-3.197911,-3.683301,0.685306,-0.345284,1.083684,-0.362420,1.615885,0.571659,-0.318568,...,0.164241,-0.173110,0.059358,0.681950,-0.015497,0.202153,-1.738823,0.126884,0.167264,0.0
1,-2.367808,-3.625519,-3.769293,2.885057,-1.727100,2.069011,-2.432538,-1.353834,6.598667,2.288432,...,0.172229,-0.206500,0.871788,-0.469211,-1.653286,-0.300808,-2.636556,2.168907,0.057289,0.0
2,0.955761,-3.940034,-3.658091,1.559538,1.220638,2.289073,-1.632745,1.447898,2.440878,-0.465305,...,-0.088670,-0.547785,-2.422177,0.457459,2.474113,1.891349,0.569253,-2.981146,0.466576,0.0
3,1.100196,-3.617575,-2.599305,1.138038,0.308002,3.214747,-2.430302,-1.194018,0.817247,-1.260229,...,-0.094377,-0.151103,-1.033110,-0.588894,1.709334,-0.797130,0.470101,-1.846555,1.864802,0.0
4,1.085183,-3.670813,-2.632366,1.162059,0.307024,3.232055,-2.441274,-1.180598,0.828931,-1.280191,...,-0.094397,-0.151596,-1.033970,-0.602443,1.711226,-0.805447,0.461692,-1.848466,1.877393,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407100,2.198992,6.174036,-1.849284,0.507613,1.200152,1.030194,0.066078,-0.778142,-0.621469,0.776300,...,0.260143,-0.111771,0.747712,-2.254769,0.512560,0.755710,-1.282370,-0.216729,-0.416225,0.0
407101,4.085266,0.511354,0.851459,-2.138276,4.629585,1.396822,6.593259,-1.886976,2.365411,-1.683856,...,-0.019167,-0.167312,-0.332209,0.172868,1.184639,-0.612740,0.332569,0.274275,-1.084096,0.0
407102,-0.783451,6.658742,-2.402172,2.054218,-0.406915,-1.430170,0.817435,-0.200958,0.341388,1.381247,...,-0.082776,0.109984,0.608705,-0.915793,-1.100528,-0.139049,-0.429336,-0.239407,-2.483183,0.0
407103,0.209536,-1.043600,-0.581526,-0.583789,1.830679,-0.378196,1.548151,0.957460,-2.732321,1.565845,...,0.081812,-0.292641,-1.164162,-1.867922,2.249329,1.614157,0.853153,-1.518733,-1.524806,0.0


In [20]:
from sklearn.linear_model import LinearRegression
X = pd.read_csv('training_median.csv', index=False)
reg = LinearRegression().fit(X, losses)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.013194,0.063571,0.088012,0.009799,0.021530,0.060274,0.074977,0.028069,-0.054494,-0.025556,...,-0.030094,-0.013276,-0.022436,0.011691,-0.006522,0.022634,-0.037710,-0.004630,-0.000122,0.025122
1,-0.001657,-0.016529,-0.001698,-0.023342,0.031382,0.010239,0.024535,-0.043607,0.003146,0.068984,...,-0.087160,-0.021568,-0.001560,-0.065198,-0.006465,-0.025500,-0.003053,-0.007712,-0.033729,0.041375
2,-0.168495,-0.056902,0.037373,-0.039389,0.003843,0.020237,0.032745,0.013362,-0.019529,-0.056134,...,-0.000458,-0.005160,-0.016093,0.023833,0.009690,-0.034927,0.029056,0.011031,-0.032125,0.004241
3,0.065422,-0.112453,0.023047,-0.075998,-0.061645,-0.026318,-0.003452,-0.015500,-0.029477,-0.008439,...,0.019828,-0.006161,-0.013677,-0.010045,-0.014879,-0.039260,0.060832,0.029565,-0.018894,-0.006463
4,0.079353,-0.078708,0.052352,0.020750,0.040520,0.006239,0.004638,-0.070893,-0.005416,-0.045614,...,-0.000284,-0.008746,-0.017978,-0.023423,-0.005638,-0.003815,0.021324,-0.006199,0.013405,-0.029775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,0.074270,0.025524,-0.004397,-0.037993,0.007221,0.010578,0.023441,0.044455,-0.026937,-0.074626,...,0.004513,-0.008247,-0.021883,0.032083,0.048526,-0.063318,-0.025847,0.017867,-0.031623,0.008319
376,0.102476,-0.005623,-0.051868,0.098793,-0.005386,-0.023201,-0.009690,0.040823,-0.035976,-0.012860,...,-0.010727,0.044215,0.002330,-0.003159,0.000519,-0.020201,-0.049859,-0.040714,-0.029183,-0.004049
377,-0.038408,0.094132,0.052616,0.029553,0.086855,0.043294,-0.040265,0.036500,-0.002072,-0.046976,...,-0.008539,-0.009608,-0.017151,-0.020038,0.037361,-0.041571,0.004421,0.019487,-0.010844,0.046185
378,-0.011267,-0.069999,0.032222,-0.031614,-0.054038,0.000026,0.075849,-0.007304,0.059886,0.057291,...,-0.047949,0.056129,0.006769,0.043736,0.039515,-0.021966,0.004106,0.023296,0.016896,-0.009466


In [22]:
reg.score(X, losses)

0.0612799283492892

In [26]:
reg.predict(X)

array([-0.44513726, -0.50251806, -0.27290356, -0.47367025, -0.45830999,
       -0.45300557, -0.28228123, -0.3749926 , -0.46197278, -0.32352304,
       -0.33454921, -0.49079099, -0.29755865, -0.45825602, -0.34218058,
       -0.33090566, -0.45927534, -0.49494554, -0.40834421, -0.33014576,
       -0.43924518, -0.40736567, -0.31734305, -0.30808717, -0.25406564,
       -0.13088226, -0.44102477, -0.53109903, -0.37245684, -0.42934862,
       -0.40851232, -0.47388445, -0.35339001, -0.50898469, -0.45493776,
       -0.37621079, -0.3495192 , -0.19162937, -0.35744313, -0.48979294,
       -0.40768777, -0.35863786, -0.40838566, -0.55179176, -0.49282487,
       -0.52230278, -0.39244902, -0.3710549 , -0.52541381, -0.37462589,
       -0.53982485, -0.50350087, -0.42415267, -0.28203195, -0.25489556,
       -0.47053966, -0.48248748, -0.37935066, -0.22498919, -0.46979966,
       -0.58388129, -0.08762152, -0.47618866, -0.47786175, -0.38271526,
       -0.48726736, -0.36782289, -0.43093281, -0.3707852 , -0.23

In [27]:
losses

0     -0.447475
1     -0.708914
2     -0.669580
3     -0.655944
4     -0.632838
         ...   
375    0.606037
376    0.249668
377   -0.269027
378   -0.071285
379   -0.854080
Name: lr, Length: 380, dtype: float64

In [28]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.013194,0.063571,0.088012,0.009799,0.021530,0.060274,0.074977,0.028069,-0.054494,-0.025556,...,-0.030094,-0.013276,-0.022436,0.011691,-0.006522,0.022634,-0.037710,-0.004630,-0.000122,0.025122
1,-0.001657,-0.016529,-0.001698,-0.023342,0.031382,0.010239,0.024535,-0.043607,0.003146,0.068984,...,-0.087160,-0.021568,-0.001560,-0.065198,-0.006465,-0.025500,-0.003053,-0.007712,-0.033729,0.041375
2,-0.168495,-0.056902,0.037373,-0.039389,0.003843,0.020237,0.032745,0.013362,-0.019529,-0.056134,...,-0.000458,-0.005160,-0.016093,0.023833,0.009690,-0.034927,0.029056,0.011031,-0.032125,0.004241
3,0.065422,-0.112453,0.023047,-0.075998,-0.061645,-0.026318,-0.003452,-0.015500,-0.029477,-0.008439,...,0.019828,-0.006161,-0.013677,-0.010045,-0.014879,-0.039260,0.060832,0.029565,-0.018894,-0.006463
4,0.079353,-0.078708,0.052352,0.020750,0.040520,0.006239,0.004638,-0.070893,-0.005416,-0.045614,...,-0.000284,-0.008746,-0.017978,-0.023423,-0.005638,-0.003815,0.021324,-0.006199,0.013405,-0.029775
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,0.074270,0.025524,-0.004397,-0.037993,0.007221,0.010578,0.023441,0.044455,-0.026937,-0.074626,...,0.004513,-0.008247,-0.021883,0.032083,0.048526,-0.063318,-0.025847,0.017867,-0.031623,0.008319
376,0.102476,-0.005623,-0.051868,0.098793,-0.005386,-0.023201,-0.009690,0.040823,-0.035976,-0.012860,...,-0.010727,0.044215,0.002330,-0.003159,0.000519,-0.020201,-0.049859,-0.040714,-0.029183,-0.004049
377,-0.038408,0.094132,0.052616,0.029553,0.086855,0.043294,-0.040265,0.036500,-0.002072,-0.046976,...,-0.008539,-0.009608,-0.017151,-0.020038,0.037361,-0.041571,0.004421,0.019487,-0.010844,0.046185
378,-0.011267,-0.069999,0.032222,-0.031614,-0.054038,0.000026,0.075849,-0.007304,0.059886,0.057291,...,-0.047949,0.056129,0.006769,0.043736,0.039515,-0.021966,0.004106,0.023296,0.016896,-0.009466


In [32]:
T = pd.read_csv('testing_portfolios_median.csv')
T = T.drop(T.columns[0], axis=1)
T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,0.414944,-0.401407,-0.265623,-0.124719,-0.027988,-0.083522,-0.158461,-0.002962,-0.100082,-0.079790,...,-0.060503,-0.007134,-0.044819,-0.132437,-0.003314,0.021405,0.031614,-0.026372,-0.004882,-0.029343
1,0.487281,-0.427202,-0.227643,-0.289671,-0.108187,0.137409,-0.182159,-0.076565,0.009362,-0.117430,...,-0.009839,-0.078557,-0.008140,0.029593,0.008215,0.037188,0.089943,-0.041670,-0.042328,-0.037412
2,-0.764856,-0.388884,-0.297688,-0.136240,-0.040044,-0.020258,-0.089545,0.028512,0.022879,0.048552,...,-0.003791,-0.033162,0.001787,-0.088309,-0.014585,0.033484,-0.028766,0.019207,-0.054666,-0.001369
3,-0.671117,-0.402407,-0.342447,-0.094016,0.048151,0.186040,-0.199537,0.031048,-0.083252,-0.016426,...,0.009672,0.009754,0.027996,-0.090663,-0.062254,-0.011017,-0.015657,-0.020701,-0.030709,0.025244
4,0.701175,-0.449857,-0.306889,-0.097079,0.009964,-0.026817,-0.185084,-0.151040,-0.058367,0.042697,...,0.028657,-0.018686,0.003478,-0.079171,0.059253,-0.145750,0.026741,0.034051,0.042729,0.000450
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
325,0.755276,-0.427588,-0.383248,-0.147788,-0.066962,-0.089155,-0.108481,-0.087863,0.053564,-0.100000,...,-0.089448,0.058053,0.069615,-0.083665,-0.012232,-0.028937,0.043440,0.019957,-0.049998,-0.020238
326,-0.654412,-0.458353,-0.296997,-0.085277,-0.013907,0.136378,-0.168647,-0.042308,0.001937,-0.033238,...,-0.003761,-0.077820,-0.031796,0.051043,-0.154347,0.001892,0.011314,-0.012502,-0.035646,-0.034492
327,-0.823368,-0.445005,-0.336572,-0.135162,-0.038062,0.034224,-0.104783,-0.020929,0.066877,-0.022600,...,-0.068450,-0.003380,-0.009735,0.037699,0.017754,-0.039506,0.020415,0.030796,-0.048259,-0.034015
328,0.611993,-0.424621,-0.207412,-0.191026,-0.035120,0.134583,-0.183118,-0.136972,0.081381,-0.020794,...,-0.109460,0.038521,0.011730,-0.109632,0.027563,0.096030,0.011691,0.013061,-0.035390,-0.047618


In [35]:
predictions = reg.predict(T)

In [38]:
output = []
for i in range(len(predictions)):
    id_val = 'portfolio_' + str(i+1)
    pred = predictions[i]
    output.append((id_val, pred))
output

[('portfolio_1', -0.49879348224783115),
 ('portfolio_2', -0.2030330625591823),
 ('portfolio_3', -0.28303874954135166),
 ('portfolio_4', -0.30272431774326203),
 ('portfolio_5', -0.68615963724777),
 ('portfolio_6', -0.07890805697627185),
 ('portfolio_7', -0.5324123688105366),
 ('portfolio_8', -0.5887253231644359),
 ('portfolio_9', -0.24353800865584516),
 ('portfolio_10', -0.6250811735350893),
 ('portfolio_11', 0.12680386698204693),
 ('portfolio_12', 0.011853569473855907),
 ('portfolio_13', 0.09265479306118513),
 ('portfolio_14', -0.6180135181979137),
 ('portfolio_15', -0.04116852377018354),
 ('portfolio_16', 0.15910797163113866),
 ('portfolio_17', 0.006486118994324108),
 ('portfolio_18', -0.27349243939518864),
 ('portfolio_19', 0.02393073980446503),
 ('portfolio_20', -0.794213937525527),
 ('portfolio_21', 0.19985442630566963),
 ('portfolio_22', 0.11550794975526568),
 ('portfolio_23', -0.21723216795098269),
 ('portfolio_24', 0.09830843388313848),
 ('portfolio_25', -0.39414973662307595),
 

In [40]:
op_pd = pd.DataFrame(output, columns=['ID','ln_LR'])

In [43]:
op_pd

Unnamed: 0,ID,ln_LR
0,portfolio_1,-0.498793
1,portfolio_2,-0.203033
2,portfolio_3,-0.283039
3,portfolio_4,-0.302724
4,portfolio_5,-0.686160
...,...,...
325,portfolio_326,-0.528591
326,portfolio_327,0.164213
327,portfolio_328,0.205070
328,portfolio_329,-0.815784


In [42]:
op_pd.to_csv('linear_regression_out.csv', index=False)

In [63]:
import glob
path = 'testing_portfolios' # use your path
all_files = glob.glob(path + "/*.csv")
files = []
for f in all_files:
    t = f[24:]
    t = t[:-4]
    files.append(t)
def sort_file(item):
    return int(item.split('_')[1])
files = sorted(files, key=sort_file)
files

['portfolio_1',
 'portfolio_2',
 'portfolio_3',
 'portfolio_4',
 'portfolio_5',
 'portfolio_6',
 'portfolio_7',
 'portfolio_8',
 'portfolio_9',
 'portfolio_10',
 'portfolio_11',
 'portfolio_12',
 'portfolio_13',
 'portfolio_14',
 'portfolio_15',
 'portfolio_16',
 'portfolio_17',
 'portfolio_18',
 'portfolio_19',
 'portfolio_20',
 'portfolio_21',
 'portfolio_22',
 'portfolio_23',
 'portfolio_24',
 'portfolio_25',
 'portfolio_26',
 'portfolio_27',
 'portfolio_28',
 'portfolio_29',
 'portfolio_30',
 'portfolio_41',
 'portfolio_42',
 'portfolio_43',
 'portfolio_44',
 'portfolio_45',
 'portfolio_46',
 'portfolio_47',
 'portfolio_48',
 'portfolio_49',
 'portfolio_50',
 'portfolio_51',
 'portfolio_52',
 'portfolio_53',
 'portfolio_54',
 'portfolio_55',
 'portfolio_56',
 'portfolio_57',
 'portfolio_58',
 'portfolio_59',
 'portfolio_60',
 'portfolio_61',
 'portfolio_62',
 'portfolio_63',
 'portfolio_64',
 'portfolio_65',
 'portfolio_66',
 'portfolio_67',
 'portfolio_68',
 'portfolio_69',
 'port

In [64]:
output = []
for i in range(len(predictions)):
    pred = predictions[i]
    output.append((files[i], pred))
output

[('portfolio_1', -0.49879348224783115),
 ('portfolio_2', -0.2030330625591823),
 ('portfolio_3', -0.28303874954135166),
 ('portfolio_4', -0.30272431774326203),
 ('portfolio_5', -0.68615963724777),
 ('portfolio_6', -0.07890805697627185),
 ('portfolio_7', -0.5324123688105366),
 ('portfolio_8', -0.5887253231644359),
 ('portfolio_9', -0.24353800865584516),
 ('portfolio_10', -0.6250811735350893),
 ('portfolio_11', 0.12680386698204693),
 ('portfolio_12', 0.011853569473855907),
 ('portfolio_13', 0.09265479306118513),
 ('portfolio_14', -0.6180135181979137),
 ('portfolio_15', -0.04116852377018354),
 ('portfolio_16', 0.15910797163113866),
 ('portfolio_17', 0.006486118994324108),
 ('portfolio_18', -0.27349243939518864),
 ('portfolio_19', 0.02393073980446503),
 ('portfolio_20', -0.794213937525527),
 ('portfolio_21', 0.19985442630566963),
 ('portfolio_22', 0.11550794975526568),
 ('portfolio_23', -0.21723216795098269),
 ('portfolio_24', 0.09830843388313848),
 ('portfolio_25', -0.39414973662307595),
 

In [65]:
op_pd = pd.DataFrame(output, columns=['ID','ln_LR'])
op_pd.to_csv('linear_regression_out.csv', index=False)
op_pd

Unnamed: 0,ID,ln_LR
0,portfolio_1,-0.498793
1,portfolio_2,-0.203033
2,portfolio_3,-0.283039
3,portfolio_4,-0.302724
4,portfolio_5,-0.686160
...,...,...
325,portfolio_596,-0.528591
326,portfolio_597,0.164213
327,portfolio_598,0.205070
328,portfolio_599,-0.815784


In [9]:
portfolios

[               0         1         2         3         4         5         6  \
 189654 -4.751556 -0.114789 -1.147279 -1.110383  0.137843  0.929312 -0.351165   
 401448 -0.690923  1.641516 -0.134816 -1.460797 -0.543577 -2.626249  2.406499   
 48321  -0.955955 -2.832010 -1.809924  1.231409  1.641113 -0.633431 -1.848302   
 393897 -1.056322  8.559137 -2.256912  1.285432  3.527938  1.746366  6.758357   
 12276   0.276857 -2.967412 -3.621377  0.564532  0.066668 -0.246812  0.338766   
 ...          ...       ...       ...       ...       ...       ...       ...   
 198371  1.045297 -0.553109  0.329827  0.639911 -2.139931 -0.878772  0.234385   
 133110 -2.535938 -2.770120 -1.700924  0.574361  1.171619  0.974007 -0.398681   
 115393 -1.973842 -0.549395  0.364740  0.729369 -2.615029 -1.997011  0.855412   
 331088 -5.002817 -0.422438 -1.427130 -0.733606  0.227056  2.802969  0.403038   
 54532   2.320943 -2.079204 -2.004427 -0.049774 -0.491710  0.363569 -0.321569   
 
                7         