In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn import preprocessing

In [5]:
# read traning data
df = pd.read_csv("training_data.csv", low_memory=False)
start_index = list(df.columns).index('Claim_Count')

In [6]:
target = df['Loss_Amount']
df = df.drop(df.columns[start_index:], axis=1)
df

NameError: name 'avg' is not defined

In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
import copy

def drop_unknown(df):
    return df.dropna()

def label_encode(df):
    le = preprocessing.LabelEncoder()
    for col in df.columns:
        col = str(col)
        if str(df.loc[:,col].dtype) == 'object':
            le.fit(df.loc[:,col]) 
            df.loc[:,col] = le.transform(df.loc[:,col])
    return df

def clean_data(df, missing_handler=drop_unknown, data_encoder=label_encode):
    df = missing_handler(df)
    return data_encoder(df)

def pca(df, num_components):
    pca = PCA(num_components).fit(df)
    principal_cols = pca.transform(df)
    return pd.DataFrame(data = principal_cols)
    

def preprocess_data(df, **params):
    """
    Preprocesses dataframe, with customizable options.
    
    params:
        dropped_columns[list]: list of columns to be dropped before preprocessing begins.
    
        clean[Boolean]: should the data be cleaned.
        
        missing_handler[function(dataframe) returns dataframe]: how to handle missing data,
                       'dropna' by default.
                                             
        data_encoder[function(dataframe) returns dataframe]: specifies encoder for data, 
                    'label encoding' is the default.
        
        feature_transform[Boolean]: does feature transformation need to be performed.
        
        feature_transformer[function(dataframe) returns dataframe]: specifies the feature transformer,
                        'Standardization' is the default.
                                          
        feature_selection[Boolean]: does feature selection need to be performed.
        
        feature_selector[function(dataframe) returns dataframe]: specifies the feature selector,
                        'PCA' is the default.
                        
        num_components[int]: specifies the number of principal components we want.
                                 Default is 30 components.
        
    NOTE: all boolean params are set to 'False' by default.
          So the call 'preprocessor()' does nothing, because no arguments are passed.
    """
    drop_cols = params.get('dropped_columns', [])
    df = df.drop(drop_cols, axis=1)
    
    do_clean = params.get('clean', False)
    if do_clean:
        missing_handler = params.get('missing_handler', drop_unknown)
        data_encoder = params.get('data_encoder', label_encode)
        df = clean_data(df, missing_handler, data_encoder)
    
    do_transform = params.get('feature_transform', False)
    if do_transform:
        transformer = params.get('feature_transformer', StandardScaler().fit_transform)
        scaled_features = transformer(df)
        df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)
        
    do_feature_selection = params.get('feature_selection', False)
    num_components = params.get('num_components', 30)
    if do_feature_selection:
        feature_selector = params.get('feature_selector', pca)
        df = feature_selector(df, num_components)
    
    return df

In [10]:
df_clean = preprocess_data(df, dropped_columns=['PolicyNo'], clean=True, feature_transform=True,
                           feature_selection=True)
df_clean

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,0.785450,-3.209023,3.674657,0.708684,-0.123389,-0.351678,-1.078479,0.355020,-1.613381,0.557680,...,1.473379,0.215753,0.435169,0.888003,-0.139465,0.126721,0.467495,-0.373731,0.669415,0.102986
1,-2.402349,-3.605650,3.758536,2.903636,0.004053,-1.741795,-2.083471,2.408650,1.336873,6.600749,...,-0.834658,0.090615,0.834899,1.391255,-0.622428,-0.235271,-1.643442,-0.533840,-0.118106,0.420246
2,0.913796,-3.951205,3.648198,1.581371,-0.020146,1.205139,-2.307780,1.610967,-1.452347,2.427298,...,-3.559137,0.408827,1.502663,2.234458,-0.750039,-0.503332,1.097267,-0.882805,-1.433142,0.217913
3,1.058758,-3.631684,2.589896,1.154251,-0.040150,0.287273,-3.239068,2.394210,1.192992,0.813500,...,-0.900659,-0.167007,1.325234,2.076389,-1.117896,-0.030729,0.776766,-0.745942,-1.501702,0.018583
4,1.043411,-3.684705,2.623003,1.178389,-0.037333,0.286129,-3.256346,2.405084,1.179504,0.825138,...,-0.909805,-0.178503,1.335868,2.092105,-1.148585,-0.032050,0.778184,-0.747565,-1.517703,0.025968
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407100,2.228527,6.141460,1.818947,0.529052,-0.394539,1.196978,-1.041689,-0.080872,0.786084,-0.629960,...,-0.292738,-1.096991,1.615012,-1.080812,-0.221830,0.346417,-0.739242,0.012085,-0.369013,1.771514
407101,4.070448,0.465518,-0.851061,-2.137626,0.040404,4.615673,-1.363630,-6.618264,1.866449,2.352853,...,-0.491736,-0.097001,0.283760,0.279784,-0.415981,-0.371176,2.148225,-0.083990,-0.686483,0.658414
407102,-0.747832,6.652316,2.363396,2.077997,-0.546127,-0.391419,1.419286,-0.815882,0.201909,0.336397,...,0.548651,-1.202113,1.914405,-1.051068,-0.163129,-1.086661,1.184296,-1.342895,0.480556,1.343198
407103,0.189861,-1.050174,0.583332,-0.578568,0.049488,1.831582,0.400301,-1.530487,-0.942854,-2.739093,...,-3.702581,0.176425,0.278666,0.277866,-0.601770,-0.885833,0.521046,1.518777,-0.459026,0.182170


In [8]:
# TODO: make portfolios