In [0]:
!pip install category_encoders
!pip install catboost

In [0]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Move into the dataset folder so the CSVs can be read by bare file name.
# The path is relative: it only resolves when the current directory is the
# parent of `drive` (presumably /content, given the mount point above).
%cd drive/My\ Drive/Datasets/Janata_hackathon_05

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statistics import mode
import warnings
# Suppress every warning for the rest of the session (this also hides
# deprecation and convergence warnings from the libraries used below).
warnings.filterwarnings('ignore')
# Seed NumPy's global random generator for repeatable runs.
np.random.seed(0)

In [0]:
# Read both competition splits from the current working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))
# Row counts of each split.
(len(train_df), len(test_df))

In [0]:
# Step back up four directory levels, one per component of the relative dataset
# path entered earlier (drive / My Drive / Datasets / Janata_hackathon_05).
%cd ..
%cd ..
%cd ..
%cd ..

In [0]:
# Peek at the first five training rows.
train_df.head(n=5)

In [0]:
# Number of missing values per column in the training split.
train_df.isnull().sum()

In [0]:
# Number of missing values per column in the test split.
test_df.isnull().sum()

In [0]:
# Class balance of the target column.
train_df.Surge_Pricing_Type.value_counts()

In [0]:
# Columns handed to the category encoders further down.
cat_cols = [
    'Type_of_Cab',
    'Confidence_Life_Style_Index',
    'Destination_Type',
    'Gender',
]
# Columns grouped as continuous features.
con_cols = [
    'Life_Style_Index',
    'Var1',
    'Customer_Since_Months',
]

In [0]:
# Keep the test identifiers for the submission file.
ids = test_df['Trip_ID']
# Pull the labels out as a NumPy array, then remove them from the feature frame.
Y = train_df['Surge_Pricing_Type'].to_numpy()
train_df = train_df.drop(columns=['Surge_Pricing_Type'])

In [0]:
# Remember where the training rows end so the stacked frame can be split again
# after preprocessing.
train_size = len(train_df)
# Stack train on top of test so encoding, imputation and scaling see both.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the test rows repeat the training row labels, which makes any label-based
# lookup or alignment on `df` ambiguous.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
# Trip_ID is not used as a feature.
df = df.drop(['Trip_ID'], axis=1)
cols = list(df.columns.values)
df.head(3)

In [0]:
# Distribution of Customer_Since_Months across train + test, 10 bins.
df.Customer_Since_Months.hist(bins=10)

In [0]:
# Distribution of Customer_Rating across train + test, 15 bins.
df.Customer_Rating.hist(bins=15)

In [0]:
# Frequency of each Type_of_Cab value as a bar chart.
df['Type_of_Cab'].value_counts().plot.bar()

In [0]:
# Frequency of each Gender value as a bar chart.
df['Gender'].value_counts().plot.bar()

In [0]:
# Replace the columns listed in cat_cols with ordinal codes, fitted on the
# combined train + test frame.
# NOTE(review): confirm how the encoder's default missing-value handling treats
# NaN in these columns. If NaN receives its own code, the imputer in the next
# cell will not see those cells as missing.
ce_bin = ce.OrdinalEncoder(cols=cat_cols)
df = ce_bin.fit_transform(df)

In [0]:
# Fill the remaining NaNs with scikit-learn's iterative imputer, starting from
# the most frequent value of each column.
imputer = IterativeImputer(max_iter=100, initial_strategy='most_frequent', random_state=0)
new_df = imputer.fit_transform(df)

# fit_transform returns a bare array; write it back column by column so the
# frame keeps its column names and index.
for position, column in enumerate(list(df.columns.values)):
  df[column] = new_df[:, position]

df.head()

In [0]:
# Expand the cat_cols columns (numeric after the previous steps) into one-hot
# indicator columns. This rebinds ce_bin, so the earlier ordinal encoder object
# is no longer reachable.
ce_bin = ce.OneHotEncoder(cols=cat_cols)
df = ce_bin.fit_transform(df)
df.head()

In [0]:
# Column-wise rescaling: shift each column so its minimum is 0, then divide by
# the column's standard deviation. Both statistics come from the combined
# train + test frame.
df = df.sub(df.min()).div(df.std())
df.head(3)

In [0]:
# Undo the stacking: the first train_size rows are the training set, the rest
# are the competition test set. Copies keep the two frames independent of df.
train_df = df.iloc[:train_size].copy()
test_df = df.iloc[train_size:].copy()

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score as acc

# Stratified 80/20 hold-out split of the labelled data. Here X_test / y_test
# are the validation portion, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    Y,
    stratify=Y,
    test_size=0.2,
    random_state=0,
)
(X_train.shape, X_test.shape)

In [0]:
from sklearn.linear_model import LogisticRegression as LR

# Logistic-regression baseline with an explicit weight of 1 for each of the
# classes 1, 2 and 3.
lr = LR(class_weight={1: 1, 2: 1, 3: 1}, max_iter=500)
lr.fit(X_train, y_train)
# Accuracy on the hold-out portion.
print(acc(y_true=y_test, y_pred=lr.predict(X_test)))

In [0]:
from catboost import CatBoostClassifier

# CatBoost baseline. The hold-out set is passed as eval_set and
# use_best_model=True keeps the iteration that scored best on it.
cat = CatBoostClassifier(custom_metric=['Accuracy'], iterations=800)
cat.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    use_best_model=True,
)
# Accuracy on that same hold-out set.
print(acc(y_true=y_test, y_pred=cat.predict(X_test)))

In [0]:
from xgboost import XGBClassifier

# XGBoost baseline with early stopping on the hold-out set.
xgb = XGBClassifier(n_estimators=750, max_depth=4, seed=0)
# `verbose` is an argument of fit(), where an integer prints the evaluation
# metric every that-many boosting rounds. Passed to the constructor it is not a
# recognised estimator parameter and does not control the logging interval.
xgb.fit(
    X_train,
    y_train,
    early_stopping_rounds=50,
    eval_metric="mlogloss",
    # The last eval_set entry (the hold-out split) drives early stopping.
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=100,
)
print(acc(y_test, xgb.predict(X_test)))

In [0]:
from lightgbm import LGBMClassifier

# LightGBM baseline with early stopping on the hold-out set.
lgb = LGBMClassifier(n_estimators=750, max_depth=4, random_state=0)
# In the constructor, `verbose` is the library's verbosity level (50 would mean
# debug-level logging), not a print interval. The "report every 50 rounds"
# behaviour is the `verbose` argument of fit(), which belongs to the same fit
# API as the early_stopping_rounds argument used here.
lgb.fit(
    X_train,
    y_train,
    early_stopping_rounds=50,
    eval_metric="multi_logloss",
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=50,
)
print(acc(y_test, lgb.predict(X_test)))

In [0]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score as acc
from sklearn.linear_model import LogisticRegression as LR
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# 5-fold stratified cross-validation of the four candidate models.
# X_train / X_test / y_train / y_test and lr / lgb / cat / xgb are rebound on
# every fold, so after the loop they hold the LAST fold's split and models.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cnt = 0
for train_index, test_index in kfold.split(train_df, Y):
  cnt += 1
  # kfold.split yields positional row numbers, so select rows with .iloc.
  # (.loc only gives the same rows while the index happens to be 0..n-1.)
  X_train, X_test = train_df.iloc[train_index], train_df.iloc[test_index]
  y_train, y_test = Y[train_index], Y[test_index]

  print('Fold '+str(cnt)+' : ')

  lr = LR(max_iter=500)
  lr.fit(X_train, y_train)
  print(' LR : ',end='')
  print(acc(y_test,lr.predict(X_test)))

  lgb = LGBMClassifier(n_estimators=507, max_depth=4, random_state=0)
  lgb.fit(X_train, y_train)
  print('LGM : ',end='')
  print(acc(y_test,lgb.predict(X_test)))

  # NOTE: CatBoost selects its best iteration on the same fold it is scored on
  # below, so its printed accuracy is not measured the same way as the others.
  cat = CatBoostClassifier(iterations=530, logging_level='Silent')
  cat.fit(X_train, y_train, eval_set=(X_test,y_test), use_best_model=True)
  print('Cat : ',end='')
  print(acc(y_test,cat.predict(X_test)))

  xgb = XGBClassifier(n_estimators=482, max_depth=4, seed=0)
  xgb.fit(X_train, y_train)
  print('XGB : ',end='')
  print(acc(y_test,xgb.predict(X_test)))

In [0]:
from sklearn.metrics import accuracy_score as acc
from sklearn.model_selection import train_test_split

# Rebuild the stratified 80/20 hold-out split; the cross-validation loop left
# X_train / X_test / y_train / y_test bound to its final fold.
X_train, X_test, y_train, y_test = train_test_split(
    train_df, Y, random_state=0, test_size=0.2, stratify=Y
)
(X_train.shape, X_test.shape)

In [0]:
from sklearn.ensemble import VotingClassifier as VC

# Soft-voting ensemble over whatever cat / lgb / xgb objects are currently
# bound in the session.
clf = VC(
    voting='soft',
    estimators=[
        ('cat', cat),
        ('lgb', lgb),
        ('xgb', xgb),
    ],
)
clf.fit(X_train, y_train)
# Ensemble accuracy on the hold-out portion.
print(acc(y_true=y_test, y_pred=clf.predict(X_test)))

In [0]:
# Refit each individual model, then the ensemble, on every labelled row before
# predicting the competition test set.
for final_model in (xgb, lgb, cat, clf):
  final_model.fit(train_df, Y)

# Show the fitted ensemble as the cell output.
clf

In [0]:
from collections import Counter

# Hard class predictions of each individual model on the competition test set.
predx = xgb.predict(test_df)
predl = lgb.predict(test_df)
# The CatBoost output is indexed as 2-D here; keep its first column.
predc = cat.predict(test_df)[:,0]

# zip() below would silently truncate to the shortest array, so check lengths.
assert len(predx) == len(predl) == len(predc)

# Weighted majority vote across the three models; p, q, r are the vote weights
# for xgb, lgb and cat. Stored under its own name: previously it was assigned
# to y_pred and overwritten on the very next statement, so it was never used.
p=1;q=1;r=1
y_pred_vote = [ Counter( [i]*p + [j]*q + [k]*r ).most_common(1)[0][0] for i,j,k in zip(predx,predl,predc)]

# Predictions that go into the submission: the soft-voting ensemble.
y_pred = clf.predict(test_df)
y_pred.shape

In [0]:
# Build the submission table: trip identifiers alongside the predicted class.
sub = pd.DataFrame({'Trip_ID': ids}).assign(Surge_Pricing_Type=y_pred)
# Write it without the index column.
sub.to_csv('subs.csv', index=False)
sub.head()