In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn import preprocessing

In [2]:
# read traning data
df = pd.read_csv("training_data.csv", low_memory=False)
start_index = list(df.columns).index('Claim_Count')

In [3]:
target = df['Loss_Amount']
df = df.drop(df.columns[start_index:], axis=1)
df

Unnamed: 0,PolicyNo,Policy_Company,Policy_Installment_Term,Policy_Billing_Code,Policy_Method_Of_Payment,Policy_Reinstatement_Fee_Indicator,Policy_Zip_Code_Garaging_Location,Vehicle_Territory,Vehicle_Make_Year,Vehicle_Make_Description,...,EEA_Policy_Zip_Code_3,EEA_Policy_Tenure,EEA_Agency_Type,EEA_Packaged_Policy_Indicator,EEA_Full_Coverage_Indicator,EEA_Prior_Bodily_Injury_Limit,EEA_PolicyYear,SYS_Renewed,SYS_New_Business,Annual_Premium
0,164532941,Standard,6,Direct Billed to Insured,Pre-paid,N,43046,35,2004,BUIK LESABRE LI,...,430,22.7,Standard,N,Y,100-400,2006,Y,N,320.12
1,164533241,Standard,6,Direct Billed to Insured,Pre-paid,N,Unknown,35,1980,CADILLAC 4-DOOR,...,Unknown,47.1,Preferred,N,Y,100-200,2006,Y,N,259.70
2,164534633,Standard,6,Direct Billed to Insured,Pre-paid,N,43555,17,2005,PONT MONTANA SV,...,435,47.2,Non-standard,N,Y,100-400,2006,Y,N,613.74
3,164534839,Standard,6,Direct Billed to Insured,Pre-paid,N,43561,17,2005,MERC GRAND MARQ,...,435,46.7,Non-standard,Y,Y,40-100,2006,Y,N,541.66
4,164534840,Standard,6,Direct Billed to Insured,Pre-paid,N,43561,17,2005,MERC GRAND MARQ,...,435,47.2,Non-standard,Y,Y,40-100,2006,Y,N,541.66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424426,381713000,Standard,6,Direct Billed to Insured,Pre-paid,N,42851,35,1999,PONT GR PRIX GT,...,428,0.0,Preferred,N,Y,,2006,Y,Y,162.55
424427,381735600,Standard,6,Direct Billed to Insured,Pre-paid,N,43669,31,2000,NSSN QUEST,...,436,0.0,Hybrid,Y,N,100-200,2006,Y,Y,117.13
424428,382057400,Standard,6,Direct Billed to Insured,Installment,N,42487,35,1997,PONT TRANSSPORT,...,424,0.0,Preferred,N,Y,100-400,2006,N,Y,118.21
424429,382162500,Preferred,6,Direct Billed to Insured,Installment,N,43360,31,1998,PONT SUNFIRE SE,...,433,0.0,Non-standard,N,N,40-100,2006,N,Y,103.93


In [4]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import copy

def drop_unknown(df):
    return df.dropna()

def label_encode(df):
    le = preprocessing.LabelEncoder()
    for col in df.columns:
        col = str(col)
        if str(df.loc[:,col].dtype) == 'object':
            le.fit(df.loc[:,col]) 
            df.loc[:,col] = le.transform(df.loc[:,col])
    return df

def clean_data(df, missing_handler=drop_unknown, data_encoder=label_encode):
    df = missing_handler(df)
    return data_encoder(df)

def pca(df, num_components):
    pca = PCA(num_components).fit(df)
    principal_cols = pca.transform(df)
    return pd.DataFrame(data = principal_cols)
    

def preprocess_data(df, **params):
    """
    Preprocesses dataframe, with customizable options.
    
    params:
        dropped_columns[list]: list of columns to be dropped before preprocessing begins.
    
        clean[Boolean]: should the data be cleaned.
        
        missing_handler[function(dataframe) returns dataframe]: how to handle missing data,
                       'dropna' by default.
                                             
        data_encoder[function(dataframe) returns dataframe]: specifies encoder for data, 
                    'label encoding' is the default.
        
        feature_transform[Boolean]: does feature transformation need to be performed.
        
        feature_transformer[function(dataframe) returns dataframe]: specifies the feature transformer,
                        'Standardization' is the default.
                                          
        feature_selection[Boolean]: does feature selection need to be performed.
        
        feature_selector[function(dataframe) returns dataframe]: specifies the feature selector,
                        'PCA' is the default.
                        
        num_components[int]: specifies the number of principal components we want.
                                 Default is 30 components.
        
    NOTE: all boolean params are set to 'False' by default.
          So the call 'preprocessor()' does nothing, because no arguments are passed.
    """
    drop_cols = params.get('dropped_columns', [])
    df = df.drop(drop_cols, axis=1)
    
    do_clean = params.get('clean', False)
    if do_clean:
        missing_handler = params.get('missing_handler', drop_unknown)
        data_encoder = params.get('data_encoder', label_encode)
        df = clean_data(df, missing_handler, data_encoder)
    
    do_transform = params.get('feature_transform', False)
    if do_transform:
        transformer = params.get('feature_transformer', StandardScaler().fit_transform)
        scaled_features = transformer(df)
        df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)
        
    do_feature_selection = params.get('feature_selection', False)
    num_components = params.get('num_components', 30)
    if do_feature_selection:
        feature_selector = params.get('feature_selector', pca)
        df = feature_selector(df, num_components)
    
    return df

In [5]:
df_clean = preprocess_data(df, dropped_columns=['PolicyNo'], clean=True, remove_outliers=False,
                           feature_transform=True, feature_selection=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [6]:
df_clean['loss'] = target
df_clean

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,21,22,23,24,25,26,27,28,29,loss
0,0.822512,-3.197911,-3.683272,0.685334,-0.345120,1.083265,-0.362057,1.616316,0.570970,-0.318748,...,0.168577,-0.159308,0.057396,0.659582,0.007497,0.201477,-1.768407,0.074517,0.063061,0.0
1,-2.367807,-3.625517,-3.769229,2.885079,-1.726842,2.068458,-2.431455,-1.350882,6.596722,2.287766,...,0.174616,-0.151192,0.878028,-0.486768,-1.626945,-0.310712,-2.602338,2.058319,-0.092571,0.0
2,0.955764,-3.940031,-3.658030,1.559581,1.220792,2.288280,-1.632134,1.450470,2.440029,-0.465329,...,-0.086555,-0.451773,-2.446421,0.347985,2.493414,1.970037,0.592045,-3.015070,0.531439,0.0
3,1.100199,-3.617571,-2.599245,1.138075,0.308156,3.213984,-2.429607,-1.191113,0.816244,-1.260262,...,-0.120013,-0.074620,-1.037624,-0.651313,1.728663,-0.719021,0.477918,-1.868560,1.856447,0.0
4,1.085186,-3.670810,-2.632306,1.162097,0.307180,3.231289,-2.440567,-1.177664,0.827907,-1.280233,...,-0.120304,-0.074553,-1.038495,-0.665013,1.730620,-0.726906,0.469812,-1.870727,1.868735,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407100,2.198991,6.174033,-1.849245,0.507640,1.200358,1.029910,0.066741,-0.778967,-0.622625,0.775349,...,0.240331,-0.068722,0.736618,-2.272715,0.504380,0.725277,-1.304343,-0.212136,-0.411499,0.0
407101,4.085267,0.511352,0.851455,-2.138261,4.629861,1.396407,6.593444,-1.887147,2.365020,-1.683857,...,-0.022266,-0.159400,-0.334297,0.142565,1.196258,-0.593204,0.325850,0.248993,-1.134499,0.0
407102,-0.783450,6.658740,-2.402093,2.054315,-0.406606,-1.431159,0.817837,-0.201741,0.340881,1.380956,...,-0.128257,0.147907,0.613055,-0.953344,-1.080421,-0.126739,-0.460774,-0.295326,-2.549707,0.0
407103,0.209534,-1.043602,-0.581554,-0.583859,1.830651,-0.377482,1.548539,0.957357,-2.733455,1.565038,...,0.085299,-0.246390,-1.189465,-1.893553,2.208883,1.610564,0.934363,-1.487632,-1.352534,0.0


In [32]:
import random

def make_portfolios(df):
    with_claims = []
    no_claims = []
    for i, r in enumerate(df['loss']):
        if r > 0.0:
            with_claims.append(i)
        else:
            no_claims.append(i)
            
    random.shuffle(with_claims)
    random.shuffle(no_claims)
    
    ratio = int(len(no_claims)/len(with_claims))
    per_portfolio = int(1000/ratio)
    
    possible_claims_portfolios = int(len(with_claims)/(per_portfolio))
    possible_noclaims_portfolios = int(len(no_claims)/(per_portfolio*ratio))
    
    num_portfolios = min(possible_claims_portfolios, possible_noclaims_portfolios)
    
    portfolios = []
    for i in range(num_portfolios):
        portfolio = []
        for j in range(per_portfolio):
            for k in range(ratio):
                portfolio.append(df.iloc[no_claims.pop()])
            portfolio.append(df.iloc[with_claims.pop()])
            random.shuffle(portfolio)
        portfolios.append(pd.DataFrame(portfolio, columns=df.columns))    
        
    return portfolios
    
portfolios = make_portfolios(df_clean)

In [None]:
for p in portfolios:
    p