In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn import preprocessing

In [2]:
# read traning data
df = pd.read_csv("training_data.csv", low_memory=False)

In [14]:
df

Unnamed: 0,Policy_Company,Policy_Installment_Term,Policy_Billing_Code,Policy_Method_Of_Payment,Policy_Reinstatement_Fee_Indicator,Policy_Zip_Code_Garaging_Location,Vehicle_Territory,Vehicle_Make_Year,Vehicle_Make_Description,Vehicle_Performance,...,EEA_Prior_Bodily_Injury_Limit,EEA_PolicyYear,SYS_Renewed,SYS_New_Business,Annual_Premium,Claim_Count,Loss_Amount,Frequency,Severity,Loss_Ratio
0,Standard,6,Direct Billed to Insured,Pre-paid,N,43046,35,2004,BUIK LESABRE LI,Standard,...,100-400,2006,Y,N,320.12,0,0.0,0.0,0.0,0.0
1,Standard,6,Direct Billed to Insured,Pre-paid,N,Unknown,35,1980,CADILLAC 4-DOOR,Standard,...,100-200,2006,Y,N,259.70,0,0.0,0.0,0.0,0.0
2,Standard,6,Direct Billed to Insured,Pre-paid,N,43555,17,2005,PONT MONTANA SV,Standard,...,100-400,2006,Y,N,613.74,0,0.0,0.0,0.0,0.0
3,Standard,6,Direct Billed to Insured,Pre-paid,N,43561,17,2005,MERC GRAND MARQ,Standard,...,40-100,2006,Y,N,541.66,0,0.0,0.0,0.0,0.0
4,Standard,6,Direct Billed to Insured,Pre-paid,N,43561,17,2005,MERC GRAND MARQ,Standard,...,40-100,2006,Y,N,541.66,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424426,Standard,6,Direct Billed to Insured,Pre-paid,N,42851,35,1999,PONT GR PRIX GT,Standard,...,,2006,Y,Y,162.55,0,0.0,0.0,0.0,0.0
424427,Standard,6,Direct Billed to Insured,Pre-paid,N,43669,31,2000,NSSN QUEST,Standard,...,100-200,2006,Y,Y,117.13,0,0.0,0.0,0.0,0.0
424428,Standard,6,Direct Billed to Insured,Installment,N,42487,35,1997,PONT TRANSSPORT,Standard,...,100-400,2006,N,Y,118.21,0,0.0,0.0,0.0,0.0
424429,Preferred,6,Direct Billed to Insured,Installment,N,43360,31,1998,PONT SUNFIRE SE,Standard,...,40-100,2006,N,Y,103.93,0,0.0,0.0,0.0,0.0


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

def drop_unknown(df):
    return df.dropna()

def label_encode(df):
    le = preprocessing.LabelEncoder()
    for col in df.columns:
        col = str(col)
        if str(df.loc[:,col].dtype) == 'object':
            le.fit(df.loc[:,col]) 
            df.loc[:,col] = le.transform(df.loc[:,col])
    return df

def clean_data(df, missing_handler=drop_unknown, data_encoder=label_encode):
    df = missing_handler(df)
    return data_encoder(df)

def pca(df):
    # TODO: implement PCA
    pass

def preprocess_data(df, **params):
    """
    Preprocesses dataframe, with customizable options.
    
    params:
        dropped_columns[list]: list of columns to be dropped before preprocessing begins.
    
        clean[Boolean]: should the data be cleaned.
        
        missing_handler[function(dataframe) returns dataframe]: how to handle missing data,
                       'dropna' by default.
                                             
        data_encoder[function(dataframe) returns dataframe]: specifies encoder for data, 
                    'label encoding' is the default.
        
        feature_transform[Boolean]: does feature transformation need to be performed.
        
        feature_transformer[function(dataframe) returns dataframe]: specifies the feature transformer,
                        'Standardization' is the default.
                                          
        feature_selection[Boolean]: does feature selection need to be performed.
        
        feature_selector[function(dataframe) returns dataframe]: specifies the feature selector,
                        'PCA' is the default.
        
    NOTE: all boolean params are set to 'False' by default.
          So the call 'preprocessor()' does nothing, because no arguments are passed.
    """
    drop_cols = params.get('dropped_columns', [])
    df = df.drop(drop_cols, axis=1)
    
    do_clean = params.get('clean', False)
    if do_clean:
        missing_handler = params.get('missing_handler', drop_unknown)
        data_encoder = params.get('data_encoder', label_encode)
        df = clean_data(df, missing_handler, data_encoder)
    
    do_transform = params.get('feature_transform', False)
    if do_transform:
        transformer = params.get('feature_transformer', StandardScaler().fit_transform)
        scaled_features = transformer(df)
        df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)
        
    do_feature_selection = params.get('feature_selection', False)
    if do_feature_selection:
        feature_selector = params.get('feature_selector', pca)
        df = feature_selector(df)
    
    return df

In [16]:
df_clean = preprocess_data(df, dropped_columns=['PolicyNo'], clean=True, feature_transform=True)
df_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,Policy_Company,Policy_Installment_Term,Policy_Billing_Code,Policy_Method_Of_Payment,Policy_Reinstatement_Fee_Indicator,Policy_Zip_Code_Garaging_Location,Vehicle_Territory,Vehicle_Make_Year,Vehicle_Make_Description,Vehicle_Performance,...,EEA_Prior_Bodily_Injury_Limit,EEA_PolicyYear,SYS_Renewed,SYS_New_Business,Annual_Premium,Claim_Count,Loss_Amount,Frequency,Severity,Loss_Ratio
0,0.259673,-0.194320,-0.147756,1.015904,-0.367509,-0.042381,0.944394,0.986964,-1.635780,0.210534,...,-0.575711,0.0,0.358341,-0.334975,-0.063304,-0.20946,-0.069585,-0.069311,-0.068277,-0.020777
1,0.259673,-0.194320,-0.147756,1.015904,-0.367509,2.498220,0.944394,-2.258697,-1.561855,0.210534,...,-1.092869,0.0,0.358341,-0.334975,-0.357729,-0.20946,-0.069585,-0.069311,-0.068277,-0.020777
2,0.259673,-0.194320,-0.147756,1.015904,-0.367509,0.695123,-3.336346,1.122200,1.393618,0.210534,...,-0.575711,0.0,0.358341,-0.334975,1.367499,-0.20946,-0.069585,-0.069311,-0.068277,-0.020777
3,0.259673,-0.194320,-0.147756,1.015904,-0.367509,0.709144,-3.336346,1.122200,0.939362,0.210534,...,1.492923,0.0,0.358341,-0.334975,1.016255,-0.20946,-0.069585,-0.069311,-0.068277,-0.020777
4,0.259673,-0.194320,-0.147756,1.015904,-0.367509,0.709144,-3.336346,1.122200,0.939362,0.210534,...,1.492923,0.0,0.358341,-0.334975,1.016255,-0.20946,-0.069585,-0.069311,-0.068277,-0.020777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
424424,0.259673,-0.194320,-0.147756,1.015904,-0.367509,0.588563,-0.006882,0.581256,-0.515178,0.210534,...,-0.058552,0.0,-2.790634,2.985295,1.542000,-0.20946,-0.069585,-0.069311,-0.068277,-0.020777
424425,0.259673,5.146159,6.767898,1.015904,-0.367509,-1.060304,1.182213,1.392672,1.175412,0.210534,...,-1.092869,0.0,0.358341,2.985295,1.014306,-0.20946,-0.069585,-0.069311,-0.068277,-0.020777
424427,0.259673,-0.194320,-0.147756,1.015904,-0.367509,0.829724,-0.006882,0.446021,1.159607,0.210534,...,-1.092869,0.0,0.358341,2.985295,-1.052469,-0.20946,-0.069585,-0.069311,-0.068277,-0.020777
424428,0.259673,-0.194320,-0.147756,-0.984345,-0.367509,-1.307073,0.944394,0.040313,1.415541,0.210534,...,-0.575711,0.0,-2.790634,2.985295,-1.047206,-0.20946,-0.069585,-0.069311,-0.068277,-0.020777
