In [1]:
#import warnings
#warnings.filterwarnings("ignore")

In [2]:
#load primary data

def load_data(filename):
    
    import pandas as pd  
    dataset = pd.read_csv(filename)
    dataset = dataset.sample(frac=1)
    dataset.reset_index(drop=True, inplace=True) 
    
    return dataset

In [3]:
#check if any NA values in dataset

def check_NA(dataset):
    
    import pandas as pd
    import numpy as np    
    missing_value = dataset.isnull().sum().sum()
    
    return missing_value


In [4]:
#converting the target value to 1 and 0 

def adjust_target_value(dataset):
    
    dataset.Revenue[dataset.Revenue == True] = 1 
    dataset.Revenue[dataset.Revenue == False] = 0
    
    return dataset

In [5]:
#converting to one hot encoding

def one_of_c(dataset):
    
    import pandas as pd
    
    dataset['Month'] = pd.Categorical(dataset['Month'])
    monthDummy = pd.get_dummies(dataset['Month'], prefix = 'Month')
    dataset = pd.concat([dataset, monthDummy], axis=1)
    dataset.drop(['Month'], axis=1, inplace= True)


    dataset['OperatingSystems'] = pd.Categorical(dataset['OperatingSystems'])
    operatingSystemsDummy = pd.get_dummies(dataset['OperatingSystems'], prefix = 'OS')
    dataset = pd.concat([dataset, operatingSystemsDummy], axis=1)
    dataset.drop(['OperatingSystems'], axis=1, inplace= True)


    dataset['Browser'] = pd.Categorical(dataset['Browser'])
    browserDummy = pd.get_dummies(dataset['Browser'], prefix = 'Browser')
    dataset = pd.concat([dataset, browserDummy], axis=1)
    dataset.drop(['Browser'], axis=1, inplace= True)


    dataset['Region'] = pd.Categorical(dataset['Region'])
    regionDummy = pd.get_dummies(dataset['Region'], prefix = 'Region')
    dataset = pd.concat([dataset, regionDummy], axis=1)
    dataset.drop(['Region'], axis=1, inplace= True)


    dataset['TrafficType'] = pd.Categorical(dataset['TrafficType'])
    trafficTypeDummy = pd.get_dummies(dataset['TrafficType'], prefix = 'TrafficType')
    dataset = pd.concat([dataset, trafficTypeDummy], axis=1)
    dataset.drop(['TrafficType'], axis=1, inplace= True)


    dataset['VisitorType'] = pd.Categorical(dataset['VisitorType'])
    visitorTypeDummy = pd.get_dummies(dataset['VisitorType'], prefix = 'visitorType')
    dataset = pd.concat([dataset, operatingSystemsDummy], axis=1)
    dataset.drop(['VisitorType'], axis=1, inplace= True)


    dataset['Weekend'] = pd.Categorical(dataset['Weekend'])
    weekendDummy = pd.get_dummies(dataset['Weekend'], prefix = 'Weekend')
    dataset = pd.concat([dataset, weekendDummy], axis=1)
    dataset.drop(['Weekend'], axis=1, inplace= True)
    
    return dataset


In [6]:
#removing any duplicate columns

def adjust_columns(dataset):
    
    dataset = dataset.loc[:,~dataset.columns.duplicated()]
    
    return dataset

In [7]:
#seperating the features and target value for feature selction

def data_for_features(dataset):
    
    X = dataset.loc[:, dataset.columns != 'Revenue']  
    Y = dataset.iloc[:,dataset.columns == 'Revenue']
    
    return X,Y


In [8]:
#displaying importance of each feature in determining the target value

def feature_selection(X,Y):
    
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import chi2
    import pandas as pd
    
    #displaying top 40 features only (cant see all features at once)
    bestfeatures = SelectKBest(score_func=chi2, k=40)
        
    fit = bestfeatures.fit(X,Y)

    dfscores = pd.DataFrame(fit.scores_)
    dfcolumns = pd.DataFrame(X.columns)
    featureScores = pd.concat([dfcolumns,dfscores],axis=1)
    featureScores.columns = ['Specs','Score']  
    
    #uncomment to visualise the top 40 features
    print("The top 40 features are")
    print(featureScores.nlargest(40,'Score'))  
    
    return 0

In [9]:
#seperating dataset into train and test

def train_test_split(dataset):

    dataset = dataset.iloc[:,[18,52,53,17,11,63,23,51,16,58,7,8,13,6,9,10]]
    
    
    from sklearn.model_selection import train_test_split
    dataTrain, dataTest = train_test_split(dataset, test_size=0.3)   

    dataTrain.reset_index(drop=True, inplace=True)    
    dataTest.reset_index(drop=True, inplace=True)   
    
    return dataTrain, dataTest

In [10]:
#oversampling the training dataset

def oversampling(dataset):
    
    import pandas as pd
    
    target_count = dataset.Revenue.value_counts()
    #print to see the initial counts
    print("Initial class counts")
    print('Class True:', target_count[1]) #how many true cases  
    print('Class False:', target_count[0]) #how many false cases

    count_class_0, count_class_1 = dataset.Revenue.value_counts()
    
    df_class_true = dataset[dataset['Revenue'] == 1]
    df_class_false = dataset[dataset['Revenue'] == 0]

    df_class_true_over = df_class_true.sample(count_class_0, replace=True)
    df_test_over = pd.concat([df_class_false, df_class_true_over], axis=0)

    dataset = df_test_over
    count_true, count_false = dataset.Revenue.value_counts()
    
    #print to see the final counts
    print("Class counts after oversampling")
    print("Class True: ", count_true)
    print("Class False: ", count_false)
    
    return dataset
    

In [11]:
#save the databases for easy access next time

def saving_datasets(dataTrain, dataTest):
    
    dataTrain.to_csv('MLPTrainingData.csv')
    dataTest.to_csv('MLPTestingData.csv')
    
    return 0

In [12]:
data = load_data("predictionDataset")
if(data.empty==False):
    print("Dataset loaded successfully")

    
missing_value = check_NA(data)
if(missing_value==0):
    print("No missing data")
    

data = adjust_target_value(data)
if(data.empty==False):
    print("Target value converted successfully")
    

data = one_of_c(data)
if(data.empty==False):
    print("One of C encoding done successfully")

    
data = adjust_columns(data)
if(data.empty==False):
    print("No duplicate columns present")
    

features,target = data_for_features(data)
if(features.empty==False):
    print("All features loaded successfully")
if(target.empty==False):
    print("All target values loaded successfully")

if(feature_selection(features,target)==0):
    print("The best features have been determined")

    
trainData, testData = train_test_split(data)
if(trainData.empty==False):
    print("Training data loaded successfully")
if(testData.empty==False):
    print("Testing data loaded successfully")
    

trainData = oversampling(trainData)
if(trainData.empty==False):
    print("Training data has been oversampled successfully")


if(saving_datasets(trainData, testData)==0):
    print("Training and Testing data has been saved successfully")


Dataset loaded successfully
No missing data
Target value converted successfully
One of C encoding done successfully
No duplicate columns present
All features loaded successfully
All target values loaded successfully


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


The top 40 features are
                      Specs          Score
5   ProductRelated_Duration  877404.339414
8                PageValues  175126.808512
1   Administrative_Duration   41754.836841
3    Informational_Duration   35059.775770
4            ProductRelated   19317.285376
0            Administrative    1133.965531
2             Informational     357.981605
17                Month_Nov     223.548231
51            TrafficType_2     113.937321
52            TrafficType_3      70.477528
16                Month_May      54.997108
9                SpecialDay      53.797094
62           TrafficType_13      52.519206
22                     OS_3      48.546233
50            TrafficType_1      42.903495
15                Month_Mar      42.613274
57            TrafficType_8      39.174150
6               BounceRates      29.654336
7                 ExitRates      28.985072
12                Month_Feb      26.961176
21                     OS_2      20.651600
69           TrafficType_20   