In [47]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import shapiro, normaltest
from scipy.stats import skew 
from sklearn.metrics import r2_score,f1_score
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import Lasso,Ridge
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectFromModel
from imblearn.over_sampling import RandomOverSampler
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

In [48]:
df = pd.read_csv('C:\\Users\\Nik8p\\Downloads\\file\\train.csv')

In [49]:
df.head()

Unnamed: 0,ID,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Net Sales,Commision (in value),Age,Claim
0,2010,EPX,Travel Agency,Online,Cancellation Plan,61,PHILIPPINES,12.0,0.0,41,0
1,4245,EPX,Travel Agency,Online,Cancellation Plan,4,MALAYSIA,17.0,0.0,35,0
2,9251,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,26,THAILAND,19.8,11.88,47,0
3,4754,EPX,Travel Agency,Online,2 way Comprehensive Plan,15,HONG KONG,27.0,0.0,48,0
4,8840,EPX,Travel Agency,Online,2 way Comprehensive Plan,15,MALAYSIA,37.0,0.0,36,0


In [50]:
def age_convert(age): 
    result = ''
    if(age <= 21):
        result = 'Child'
    elif(age <= 50):
        result = 'Adult'
    else:
        result = 'Senior'
    return result

In [51]:
# Write a function for data preprocessing

def data_pre_processing(df):
    # Add a new column in the Database as Age Group
    df['Age Group'] = df['Age'].apply(age_convert)
    
    # Since the minimum duration that any travel can have is 1 day thus we impute it by the column median.
    df['Duration'][df['Duration'] < 0] = df['Duration'].median()

    # As we observed duration of any travel cannot be more than 731 we will impute it as 731.
    df['Duration'][df['Duration'] > 731] = 731
    
    # replacing the values that is greater than 99 with the mean of Senior Age
    df['Age'][df['Age'] > 99] = df[df['Age Group'] == 'Senior']['Age'].mean()

In [52]:
data_pre_processing(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [53]:
def feature_processing(df):
    # Converting all categorical columns into numeric using frequency encoding, label encoding and one-hot encoding.
    fe = df.groupby('Destination').size()/len(df)
    df['Dest_fe'] = df['Destination'].map(fe)
    fe_1 = df.groupby('Agency').size()/len(df)
    df['Agency_fe'] = df['Agency'].map(fe_1)
    fe_2 = df.groupby('Product Name').size()/len(df)    
    df['Product Name_fe'] = df['Product Name'].map(fe_2)
    df.drop(columns='Agency',axis=1,inplace=True)
    df.drop(columns='Destination',axis=1,inplace=True)
    df.drop(columns='Product Name',axis=1,inplace=True)
    df.drop(columns='Age Group',axis=1,inplace=True)
    #df.drop(columns='Age',axis=1,inplace=True)
    #df = pd.get_dummies(df, columns=['Agency Type','Distribution Channel'], drop_first=True)

In [54]:
feature_processing(df)

In [55]:
df = pd.get_dummies(df, columns=['Agency Type','Distribution Channel'], drop_first=True)

In [34]:
#df.drop(columns='Net Sales',axis=1,inplace=True)

In [14]:
df.head(5)

Unnamed: 0,ID,Duration,Net Sales,Commision (in value),Age,Claim,Dest_fe,Agency_fe,Product Name_fe,Agency Type_Travel Agency,Distribution Channel_Online
0,2010,61,12.0,0.0,41.0,0,0.034353,0.501204,0.253374,1,1
1,4245,4,17.0,0.0,35.0,0,0.08333,0.501204,0.253374,1,1
2,9251,26,19.8,11.88,47.0,0,0.087211,0.130243,0.130243,1,1
3,4754,15,27.0,0.0,48.0,0,0.033932,0.501204,0.201778,1,1
4,8840,15,37.0,0.0,36.0,0,0.08333,0.501204,0.201778,1,1


In [28]:
df

Unnamed: 0,ID,Duration,Net Sales,Commision (in value),Age,Claim,Dest_fe,Agency_fe,Product Name_fe,Agency Type_Travel Agency,Distribution Channel_Online
0,2010,61,12.000000,0.0000,41.0,0,0.034353,0.501204,0.253374,1,1
1,4245,4,17.000000,0.0000,35.0,0,0.083330,0.501204,0.253374,1,1
2,9251,26,19.800000,11.8800,47.0,0,0.087211,0.130243,0.130243,1,1
3,4754,15,27.000000,0.0000,48.0,0,0.033932,0.501204,0.201778,1,1
4,8840,15,37.000000,0.0000,36.0,0,0.083330,0.501204,0.201778,1,1
...,...,...,...,...,...,...,...,...,...,...,...
52305,5370,365,444.750593,111.1858,40.0,1,0.272262,0.202925,0.005639,0,1
52306,5282,8,23.000000,9.2000,54.0,0,0.010801,0.010075,0.037488,0,1
52307,7083,38,61.100000,15.2800,37.0,0,0.272262,0.202925,0.090289,0,1
52308,8523,2,10.000000,0.0000,46.0,0,0.087211,0.501204,0.253374,1,1


In [56]:
X = df.drop('Claim', axis = 1)
Y = df['Claim']

In [35]:
#X = df.drop('ID', axis = 1)

In [57]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size = 0.8, random_state =100)

In [116]:
rf = RandomForestClassifier(max_features=8)
rf.fit(X_train,Y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=8,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [117]:
y_pred_rf_train = rf.predict(X_train)

print(accuracy_score(Y_train, y_pred_rf_train))
print(precision_score(Y_train, y_pred_rf_train))
print(f1_score(Y_train, y_pred_rf_train))

1.0
1.0
1.0


In [118]:
y_pred_rf = rf.predict(X_test)

print(accuracy_score(Y_test, y_pred_rf))
print(precision_score(Y_test, y_pred_rf))
print(f1_score(Y_test, y_pred_rf))

0.9317530108965781
0.8298397040690506
0.7903699354081033


In [None]:
rf = RandomForestClassifier(n_estimators=50,max_features=5,min_samples_leaf=1)
rf.fit(X_train, Y_train)
y_pred_rf = rf.predict(X_test)
print(accuracy_score(Y_test, y_pred_rf))
print(classification_report(Y_test, y_pred_rf))
precision_score(Y_test, y_pred_rf)

In [None]:
X = df.drop('Claim', axis = 1)
Y = df['Claim']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size = 0.8, random_state =200)
ros = RandomOverSampler()
x_train_ros, y_train_ros =  ros.fit_resample(X_train, Y_train)
print(x_train_ros.shape)
r = RandomForestClassifier(n_estimators=50,max_features=5,min_samples_leaf=1)
r.fit(x_train_ros, y_train_ros)
y_pred_r = r.predict(X_test)
print(accuracy_score(Y_test, y_pred_r))
print(classification_report(Y_test, y_pred_r))
print(precision_score(Y_test, y_pred_r))

In [None]:
X = df.drop('Claim', axis = 1)
Y = df['Claim']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size = 0.8, random_state =100)

In [None]:
rf1 = RandomForestClassifier()
param_grid = { 
    'n_estimators': [50,100,150,200,250],
    #'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [3,5,6,7,8,9],
    #'min_samples_leaf':[1]
    #'criterion' :['gini'] 
}
CV_rfc = GridSearchCV(estimator=rf1, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, Y_train)
print(CV_rfc.best_params_) 

In [36]:
X = df.drop('Claim', axis = 1)
Y = df['Claim']
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, train_size = 0.9 , random_state =100)
rf = RandomForestClassifier(n_estimators=250,max_features=5,min_samples_leaf=1)
rf.fit(X_train, Y_train)
y_pred_rf = rf.predict(X_test)
print(accuracy_score(Y_test, y_pred_rf))
print(classification_report(Y_test, y_pred_rf))
print(precision_score(Y_test, y_pred_rf))

0.9359587077040719
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      4311
           1       0.85      0.77      0.81       920

    accuracy                           0.94      5231
   macro avg       0.90      0.87      0.89      5231
weighted avg       0.93      0.94      0.93      5231

0.8528347406513872


In [None]:
adaboost = AdaBoostClassifier()
adaboost.fit(X_train,Y_train)
y_pred_ada = adaboost.predict(X_test)
print(accuracy_score(Y_test, y_pred_ada))
print(classification_report(Y_test, y_pred_ada))
print(precision_score(Y_test, y_pred_ada))

In [None]:
logr = LogisticRegression()
logr.fit(X_train,Y_train)
y_pred_logi = logr.predict(X_test)
print(accuracy_score(Y_test, y_pred_logi))
print(classification_report(Y_test, y_pred_logi))
print(precision_score(Y_test, y_pred_logi))

In [37]:
test_df = pd.read_csv('test.csv')

In [38]:
test_df.head()

Unnamed: 0,ID,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Net Sales,Commision (in value),Age
0,17631,EPX,Travel Agency,Online,Cancellation Plan,192,HONG KONG,18.0,0.0,36
1,15064,EPX,Travel Agency,Online,1 way Comprehensive Plan,2,SINGAPORE,20.0,0.0,36
2,14139,C2B,Airlines,Online,Bronze Plan,13,SINGAPORE,13.5,3.38,24
3,19754,EPX,Travel Agency,Online,2 way Comprehensive Plan,133,"TAIWAN, PROVINCE OF CHINA",41.0,0.0,36
4,16439,C2B,Airlines,Online,Silver Plan,2,SINGAPORE,30.0,7.5,32


In [39]:
test_df.shape

(22421, 10)

In [40]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22421 entries, 0 to 22420
Data columns (total 10 columns):
ID                      22421 non-null int64
Agency                  22421 non-null object
Agency Type             22421 non-null object
Distribution Channel    22421 non-null object
Product Name            22421 non-null object
Duration                22421 non-null int64
Destination             22421 non-null object
Net Sales               22421 non-null float64
Commision (in value)    22421 non-null float64
Age                     22421 non-null int64
dtypes: float64(2), int64(3), object(5)
memory usage: 1.7+ MB


In [41]:
data_pre_processing(test_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [42]:
test_df.head()

Unnamed: 0,ID,Agency,Agency Type,Distribution Channel,Product Name,Duration,Destination,Net Sales,Commision (in value),Age,Age Group
0,17631,EPX,Travel Agency,Online,Cancellation Plan,192,HONG KONG,18.0,0.0,36.0,Adult
1,15064,EPX,Travel Agency,Online,1 way Comprehensive Plan,2,SINGAPORE,20.0,0.0,36.0,Adult
2,14139,C2B,Airlines,Online,Bronze Plan,13,SINGAPORE,13.5,3.38,24.0,Adult
3,19754,EPX,Travel Agency,Online,2 way Comprehensive Plan,133,"TAIWAN, PROVINCE OF CHINA",41.0,0.0,36.0,Adult
4,16439,C2B,Airlines,Online,Silver Plan,2,SINGAPORE,30.0,7.5,32.0,Adult


In [43]:
feature_processing(test_df)

In [44]:
test_df.head()

Unnamed: 0,ID,Agency Type,Distribution Channel,Duration,Net Sales,Commision (in value),Age,Dest_fe,Agency_fe,Product Name_fe
0,17631,Travel Agency,Online,192,18.0,0.0,36.0,0.0347,0.502208,0.257749
1,15064,Travel Agency,Online,2,20.0,0.0,36.0,0.271041,0.502208,0.044824
2,14139,Airlines,Online,13,13.5,3.38,24.0,0.271041,0.2024,0.088578
3,19754,Travel Agency,Online,133,41.0,0.0,36.0,0.016502,0.502208,0.199634
4,16439,Airlines,Online,2,30.0,7.5,32.0,0.271041,0.2024,0.054146


In [45]:
test_df = pd.get_dummies(test_df, columns=['Agency Type','Distribution Channel'], drop_first=True)

In [46]:
test_df.head()

Unnamed: 0,ID,Duration,Net Sales,Commision (in value),Age,Dest_fe,Agency_fe,Product Name_fe,Agency Type_Travel Agency,Distribution Channel_Online
0,17631,192,18.0,0.0,36.0,0.0347,0.502208,0.257749,1,1
1,15064,2,20.0,0.0,36.0,0.271041,0.502208,0.044824,1,1
2,14139,13,13.5,3.38,24.0,0.271041,0.2024,0.088578,0,1
3,19754,133,41.0,0.0,36.0,0.016502,0.502208,0.199634,1,1
4,16439,2,30.0,7.5,32.0,0.271041,0.2024,0.054146,0,1


In [47]:
y_pred_test = rf.predict(test_df)

In [48]:
data = {'ID':test_df['ID'],
        'Claim':y_pred_test}
y = pd.DataFrame(data)
y.head()

Unnamed: 0,ID,Claim
0,17631,0
1,15064,0
2,14139,0
3,19754,0
4,16439,0


In [49]:
y['Claim'].value_counts()

0    18912
1     3509
Name: Claim, dtype: int64

In [50]:
y.to_csv("submission1.csv")