In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn import preprocessing

In [2]:
# read traning data
df = pd.read_csv("training_data.csv", low_memory=False)
start_index = list(df.columns).index('Claim_Count')

In [3]:
target = df['Loss_Amount']
df = df.drop(df.columns[start_index:], axis=1)
df

Unnamed: 0,PolicyNo,Policy_Company,Policy_Installment_Term,Policy_Billing_Code,Policy_Method_Of_Payment,Policy_Reinstatement_Fee_Indicator,Policy_Zip_Code_Garaging_Location,Vehicle_Territory,Vehicle_Make_Year,Vehicle_Make_Description,...,EEA_Policy_Zip_Code_3,EEA_Policy_Tenure,EEA_Agency_Type,EEA_Packaged_Policy_Indicator,EEA_Full_Coverage_Indicator,EEA_Prior_Bodily_Injury_Limit,EEA_PolicyYear,SYS_Renewed,SYS_New_Business,Annual_Premium
0,164532941,Standard,6,Direct Billed to Insured,Pre-paid,N,43046,35,2004,BUIK LESABRE LI,...,430,22.7,Standard,N,Y,100-400,2006,Y,N,320.12
1,164533241,Standard,6,Direct Billed to Insured,Pre-paid,N,Unknown,35,1980,CADILLAC 4-DOOR,...,Unknown,47.1,Preferred,N,Y,100-200,2006,Y,N,259.70
2,164534633,Standard,6,Direct Billed to Insured,Pre-paid,N,43555,17,2005,PONT MONTANA SV,...,435,47.2,Non-standard,N,Y,100-400,2006,Y,N,613.74
3,164534839,Standard,6,Direct Billed to Insured,Pre-paid,N,43561,17,2005,MERC GRAND MARQ,...,435,46.7,Non-standard,Y,Y,40-100,2006,Y,N,541.66
4,164534840,Standard,6,Direct Billed to Insured,Pre-paid,N,43561,17,2005,MERC GRAND MARQ,...,435,47.2,Non-standard,Y,Y,40-100,2006,Y,N,541.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424426,381713000,Standard,6,Direct Billed to Insured,Pre-paid,N,42851,35,1999,PONT GR PRIX GT,...,428,0.0,Preferred,N,Y,,2006,Y,Y,162.55
424427,381735600,Standard,6,Direct Billed to Insured,Pre-paid,N,43669,31,2000,NSSN QUEST,...,436,0.0,Hybrid,Y,N,100-200,2006,Y,Y,117.13
424428,382057400,Standard,6,Direct Billed to Insured,Installment,N,42487,35,1997,PONT TRANSSPORT,...,424,0.0,Preferred,N,Y,100-400,2006,N,Y,118.21
424429,382162500,Preferred,6,Direct Billed to Insured,Installment,N,43360,31,1998,PONT SUNFIRE SE,...,433,0.0,Non-standard,N,N,40-100,2006,N,Y,103.93


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import copy

def drop_unknown(df):
    return df.dropna()

def label_encode(df):
    le = preprocessing.LabelEncoder()
    for col in df.columns:
        col = str(col)
        if str(df.loc[:,col].dtype) == 'object':
            le.fit(df.loc[:,col]) 
            df.loc[:,col] = le.transform(df.loc[:,col])
    return df

def clean_data(df, missing_handler=drop_unknown, data_encoder=label_encode):
    df = missing_handler(df)
    return data_encoder(df)

def pca(df, num_components):
    pca = PCA(num_components).fit(df)
    principal_cols = pca.transform(df)
    return pd.DataFrame(data = principal_cols)
    

def preprocess_data(df, **params):
    """
    Preprocesses dataframe, with customizable options.
    
    params:
        dropped_columns[list]: list of columns to be dropped before preprocessing begins.
    
        clean[Boolean]: should the data be cleaned.
        
        missing_handler[function(dataframe) returns dataframe]: how to handle missing data,
                       'dropna' by default.
                                             
        data_encoder[function(dataframe) returns dataframe]: specifies encoder for data, 
                    'label encoding' is the default.
        
        feature_transform[Boolean]: does feature transformation need to be performed.
        
        feature_transformer[function(dataframe) returns dataframe]: specifies the feature transformer,
                        'Standardization' is the default.
                                          
        feature_selection[Boolean]: does feature selection need to be performed.
        
        feature_selector[function(dataframe) returns dataframe]: specifies the feature selector,
                        'PCA' is the default.
                        
        num_components[int]: specifies the number of principal components we want.
                                 Default is 30 components.
        
    NOTE: all boolean params are set to 'False' by default.
          So the call 'preprocessor()' does nothing, because no arguments are passed.
    """
    drop_cols = params.get('dropped_columns', [])
    df = df.drop(drop_cols, axis=1)
    
    do_clean = params.get('clean', False)
    if do_clean:
        missing_handler = params.get('missing_handler', drop_unknown)
        data_encoder = params.get('data_encoder', label_encode)
        df = clean_data(df, missing_handler, data_encoder)
    
    do_transform = params.get('feature_transform', False)
    if do_transform:
        transformer = params.get('feature_transformer', StandardScaler().fit_transform)
        scaled_features = transformer(df)
        df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)
        
    do_feature_selection = params.get('feature_selection', False)
    num_components = params.get('num_components', 30)
    if do_feature_selection:
        feature_selector = params.get('feature_selector', pca)
        df = feature_selector(df, num_components)
    
    return df

In [5]:
df_clean = preprocess_data(df, dropped_columns=['PolicyNo'], clean=True, remove_outliers=False,
                           feature_transform=True, feature_selection=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [6]:
df_clean['loss'] = target
premiums = preprocess_data(df, dropped_columns=['PolicyNo'], clean=True, remove_outliers=False,
                           feature_transform=False, feature_selection=False)
df_clean['premium'] = premiums['Annual_Premium']

df_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,loss,premium
0,0.822511,-3.197912,-3.683211,0.685341,-0.345191,1.083720,-0.361839,1.616298,0.571414,-0.318968,...,-0.225479,0.049694,0.646965,-0.011226,0.260335,-1.716398,0.075115,0.119051,0.0,320.12
1,-2.367811,-3.625520,-3.769197,2.885160,-1.726932,2.068174,-2.432204,-1.351375,6.597779,2.291313,...,-0.239275,0.908428,-0.495519,-1.679050,-0.310589,-2.643808,2.066828,0.049289,0.0,259.70
2,0.955760,-3.940035,-3.657852,1.559602,1.220831,2.289418,-1.631374,1.451127,2.440726,-0.465232,...,-0.506011,-2.386457,0.330048,2.456191,1.932833,0.672339,-3.022418,0.502547,0.0,613.74
3,1.100195,-3.617575,-2.599070,1.138101,0.308150,3.215075,-2.428926,-1.190582,0.816930,-1.259976,...,-0.054936,-1.065084,-0.672977,1.681168,-0.744429,0.545174,-1.885164,1.851894,0.0,541.66
4,1.085182,-3.670813,-2.632129,1.162123,0.307174,3.232386,-2.439885,-1.177125,0.828603,-1.279918,...,-0.054862,-1.066153,-0.687041,1.682547,-0.752325,0.537366,-1.887535,1.864416,0.0,541.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407100,2.198991,6.174034,-1.849264,0.507698,1.200215,1.029547,0.066071,-0.779448,-0.621973,0.776289,...,-0.233778,0.800901,-2.260693,0.495701,0.750217,-1.302709,-0.199354,-0.380128,0.0,139.33
407101,4.085266,0.511352,0.851616,-2.138278,4.629800,1.397410,6.594285,-1.886302,2.365930,-1.686289,...,-0.064696,-0.390817,0.120924,1.184329,-0.554729,0.364847,0.233400,-1.149891,0.0,401.74
407102,-0.783450,6.658739,-2.401967,2.054313,-0.406713,-1.430085,0.818531,-0.201564,0.341649,1.378295,...,0.244230,0.556948,-0.959805,-1.103679,-0.096346,-0.416120,-0.301702,-2.538797,0.0,498.78
407103,0.209534,-1.043601,-0.581529,-0.583814,1.830809,-0.378109,1.548187,0.958845,-2.732461,1.567617,...,-0.400173,-1.064372,-1.908202,2.206221,1.568840,0.873562,-1.490431,-1.436435,0.0,200.34


In [7]:
import random

def make_portfolios(df):
    with_claims = []
    no_claims = []
    for i, r in enumerate(df['loss']):
        if r > 0.0:
            with_claims.append(i)
        else:
            no_claims.append(i)
            
    random.shuffle(with_claims)
    random.shuffle(no_claims)
    
    ratio = int(len(no_claims)/len(with_claims))
    per_portfolio = int(1000/ratio)
    
    possible_claims_portfolios = int(len(with_claims)/(per_portfolio))
    possible_noclaims_portfolios = int(len(no_claims)/(per_portfolio*ratio))
    
    num_portfolios = min(possible_claims_portfolios, possible_noclaims_portfolios)
    
    portfolios = []
    for i in range(num_portfolios):
        portfolio = []
        for j in range(per_portfolio):
            for k in range(ratio):
                portfolio.append(df.iloc[no_claims.pop()])
            portfolio.append(df.iloc[with_claims.pop()])
            random.shuffle(portfolio)
        portfolios.append(pd.DataFrame(portfolio, columns=df.columns))    
        
    return portfolios

In [8]:
portfolios = make_portfolios(df_clean)

In [9]:
def mean_summarizer(df):
    values = df.mean(axis = 0)
    n_df = pd.DataFrame([], columns=df.columns)
    return n_df.append(values, ignore_index=True)    

def summarize_portfolios(portfolios, summarizer=mean_summarizer, convert_to_lr=False):
    summarized_portfolios = []
    for p in portfolios:
        portfolio = summarizer(p)
        if convert_to_lr:
            portfolio['lr'] = np.log(portfolio['loss']/portfolio['premium'])
            portfolio = portfolio.drop(['loss', 'premium'], axis=1)
        cols = portfolio.columns
        summarized_portfolios.append(portfolio)
        
    if cols is None:
        return None
    n_df = pd.DataFrame([], columns=cols)
    for p in summarized_portfolios:
        n_df = n_df.append(p, ignore_index=True)
    return n_df

In [37]:
X = summarize_portfolios(portfolios, convert_to_lr=True)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,lr
0,-0.037961,-0.001598,-0.001005,-0.045195,0.088866,-0.095893,-0.016927,0.015003,-0.021913,-0.005449,...,0.030419,0.009047,-0.012006,-0.015928,-0.018580,-0.044953,0.038502,0.011080,-0.011377,0.123495
1,0.121835,-0.064778,-0.105073,-0.005829,-0.047053,0.062633,-0.045996,-0.052601,-0.037312,-0.032611,...,-0.008828,-0.028623,0.018662,-0.017561,0.065023,0.063765,-0.006464,-0.009273,0.002536,-0.413418
2,0.096128,-0.020731,0.067253,-0.036752,0.034826,-0.006414,-0.046352,-0.017591,-0.045438,-0.047662,...,0.032605,0.021092,0.038430,0.003407,-0.022863,0.050163,-0.036240,-0.007693,0.031006,-0.426502
3,-0.209502,-0.040296,0.018543,0.047315,-0.036666,0.022311,0.022533,0.001120,0.080825,-0.073504,...,-0.046378,0.121696,0.086621,0.001664,0.027304,0.048353,0.000737,0.053749,-0.010263,-0.643502
4,-0.164122,-0.056598,-0.057266,0.013974,0.046673,-0.028587,0.021109,-0.011188,0.022671,0.042694,...,-0.046737,0.026634,-0.027376,0.020161,0.008003,-0.034059,-0.016821,-0.029803,-0.043981,-0.286422
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
375,0.060622,0.019379,0.055759,0.050230,0.056552,-0.017690,-0.077933,0.064448,0.018010,-0.035324,...,-0.041521,0.027211,-0.024629,0.007927,0.059323,-0.006034,0.014351,0.002006,0.043377,-0.795185
376,-0.145228,-0.009690,0.081558,-0.010548,-0.040466,0.008460,0.000414,0.025426,-0.009484,0.020072,...,-0.012009,-0.021246,-0.022937,-0.035192,-0.034295,-0.029842,0.010413,0.008394,-0.014418,-0.954335
377,0.000373,0.134326,0.012680,-0.030979,0.042176,-0.076656,0.015725,0.017132,0.026691,-0.004911,...,0.039799,0.018703,-0.000972,0.013019,0.025203,-0.021176,0.007694,-0.009220,-0.003413,-0.312598
378,0.031226,0.122849,0.022453,0.006468,0.078451,0.068504,0.051040,-0.069796,0.055971,0.003476,...,-0.007658,-0.015954,-0.034756,-0.050853,0.007012,-0.002718,-0.010038,0.023502,0.006256,-0.545968


In [9]:
cutoff = int(len(X)*0.8)
Test = X[cutoff:]
Train = X[:cutoff]

NameError: name 'X' is not defined

In [None]:
T = Train[]
X = Train.drop()

In [1]:
import glob

# path = 'testing_portfolios' # use your path
# all_files = glob.glob(path + "/*.csv")
# #print(all_files)
# # d = []
# # i = 0
# portfolios = []
# for filename in all_files: 
#     #d[f'df_testing{i}'] = pd.read_csv(filename, index_col=None, header=0)
# #     globals()['df_testing%s' % i] = pd.read_csv(filename, index_col=None, header=0)
# #     filename = globals()['df_testing%s' % i]
# #     drop_unknown(filename)
# #     i = i + 1
#     portfolios.append(pd.read_csv(filename, index_col=None, header=0))

def get_testing_portfolios(folder_name):
    path = 'testing_portfolios' # use your path
    all_files = glob.glob(path + "/*.csv")
    portfolios = []
    for filename in all_files:
        p = pd.read_csv(filename, index_col=None, header=0)
        p = preprocess_data(df, dropped_columns=['PolicyNo'], clean=True, remove_outliers=False,
                            feature_transform=True, feature_selection=True)
        portfolios.append(p)
    return summarize_portfolios(portfolios)

In [2]:
get_testing_portfolios('testing_portfolios')

NameError: name 'pd' is not defined