In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest, AdaBoostRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_regression, f_classif, SelectKBest
from scipy.stats import pearsonr
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn import tree
from sklearn.metrics import classification_report

In [2]:
# Load the tab-separated training and test tables, in that order.
train_data, test_data = (
    pd.read_csv(tsv_path, sep="\t") for tsv_path in ("train.tsv", "test.tsv")
)

In [3]:
# Categorical columns of the training table that get expanded into
# one indicator column per category.
categorical_cols = [
    'ATM_Zone',
    'ATM_Placement',
    'ATM_TYPE',
    'ATM_Location_TYPE',
    'ATM_looks',
    'ATM_Attached_to',
    'Day_Type',
]
one_hot_df = pd.get_dummies(train_data, columns=categorical_cols)
# Plain Python list of the encoded frame's column names.
one_hot_cols = list(one_hot_df.columns)

In [42]:
def _select_k_best(features, target, score_func, k):
    """Fit a SelectKBest on (features, target).

    Returns the fitted selector and the list of retained column names,
    in the order they appear in `features`.
    """
    selector = SelectKBest(score_func, k=k).fit(features, target)
    kept = features.columns[selector.get_support()].tolist()
    return selector, kept


# Targets: 'revenue' feeds the regression model, 'rating' the classifier.
Y_reg, Y_cla = one_hot_df['revenue'], one_hot_df['rating']
X = one_hot_df.drop(columns=['revenue', 'rating'])

# Top 16 features by F-test against revenue.
selector1, top_features1 = _select_k_best(X, Y_reg, f_regression, 16)
X_reg = X[top_features1]

# Top 21 features by ANOVA F-test against rating.
selector2, top_features2 = _select_k_best(X, Y_cla, f_classif, 21)
X_cla = X[top_features2]

In [43]:
# Fixed seed so the hold-out splits (and every metric computed from them) are
# reproducible across kernel restarts and cell re-runs.
SPLIT_SEED = 42

# Derive the log target straight from the source column instead of rebinding
# Y_reg = np.log(Y_reg): re-running this cell can then never apply the log twice.
# NOTE(review): assumes revenue is strictly positive -- np.log gives -inf/NaN otherwise.
Y_reg = np.log(one_hot_df['revenue'])

# Split first, then fit the scaler on the training rows only, so the held-out
# rows do not leak into the scaling statistics.
X_train_reg_raw, X_test_reg_raw, Y_train_reg, Y_test_reg = train_test_split(
    X_reg, Y_reg, test_size=0.2, random_state=SPLIT_SEED
)
scaler = StandardScaler().fit(X_train_reg_raw)
X_train_reg = scaler.transform(X_train_reg_raw)
X_test_reg = scaler.transform(X_test_reg_raw)
# Scaled copy of the full regression feature table, kept under its original name.
scaled_X_reg = scaler.transform(X_reg)

# The classification features are split unscaled, as before.
X_train_cla, X_test_cla, Y_train_cla, Y_test_cla = train_test_split(
    X_cla, Y_cla, test_size=0.2, random_state=SPLIT_SEED
)

In [6]:
# Gradient-boosted regressor fitted on the scaled features and log-revenue target.
# random_state pins the estimator's internal randomness so the score below can
# be reproduced on a fresh Restart & Run All.
model = GradientBoostingRegressor(n_estimators=1000, max_depth=7, random_state=42)
model.fit(X_train_reg, Y_train_reg)
pre = model.predict(X_test_reg)
# Pearson correlation between the held-out log-revenue and the predictions.
pearsonr(Y_test_reg, pre)[0]

0.9907010823288985

In [44]:
# Entropy-criterion decision tree for the rating classes.
# random_state fixes the tree's internal feature shuffling so the fitted tree
# and the report below are reproducible on re-run.
model2 = tree.DecisionTreeClassifier(criterion="entropy", random_state=42)
model2.fit(X_train_cla, Y_train_cla)
Y_pred = model2.predict(X_test_cla)
# Per-class and averaged precision / recall / F1 on the hold-out split, as a dict.
classification_report(Y_test_cla, Y_pred, output_dict=True)

{'2': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 785},
 '3': {'precision': 0.9837937143232072,
  'recall': 0.9893111638954869,
  'f1-score': 0.9865447248083693,
  'support': 15156},
 '4': {'precision': 0.9867360644728005,
  'recall': 0.9795816318026502,
  'f1-score': 0.9831458324620468,
  'support': 11999},
 '5': {'precision': 0.9980997624703087,
  'recall': 0.9990489776509748,
  'f1-score': 0.998574144486692,
  'support': 2103},
 'accuracy': 0.9863861798089405,
 'macro avg': {'precision': 0.992157385316579,
  'recall': 0.9919854433372779,
  'f1-score': 0.992066175439277,
  'support': 30043},
 'weighted avg': {'precision': 0.9863937480400334,
  'recall': 0.9863861798089405,
  'f1-score': 0.9863808580622194,
  'support': 30043}}

In [45]:
# One-hot encode the test table using the same categorical columns as training.
test_df = pd.get_dummies(test_data, columns=['ATM_Zone', 'ATM_Placement','ATM_TYPE','ATM_Location_TYPE','ATM_looks','ATM_Attached_to','Day_Type'])
# Same log transform that was applied to the training revenue target.
test_Y_reg = np.log(test_df['revenue'])
test_Y_cla = test_df['rating']
test_X = test_df.drop(columns=['revenue', 'rating'])

# get_dummies only creates indicator columns for categories present in *this*
# table, so align to the training feature lists: an indicator column missing
# from the test set is filled with 0 instead of raising KeyError, and the
# column order matches what the scaler and models were fitted on.
test_X_reg = test_X.reindex(columns=top_features1, fill_value=0)
test_X_cla = test_X.reindex(columns=top_features2, fill_value=0)

# Reuse the scaler fitted on the training data; never refit on test rows.
scaler_test_X_reg = scaler.transform(test_X_reg)
pre = model.predict(scaler_test_X_reg)
# Pearson correlation between test-set log-revenue and the predictions.
pearsonr(test_Y_reg, pre)[0]

0.9904019339169388

In [46]:
# Score the decision tree on the test-set features and print the headline
# numbers, one per line: accuracy, macro-averaged recall, macro-averaged F1.
Y_pred = model2.predict(test_X_cla)
report = classification_report(test_Y_cla, Y_pred, output_dict=True)
macro_avg = report['macro avg']
print(report['accuracy'], macro_avg['recall'], macro_avg['f1-score'], sep="\n")

0.9873060648801129
0.9926643676322202
0.9928861751066935


In [48]:
# Pair each regression feature with the regressor's importance score and
# rank them from most to least important.
importances = model.feature_importances_
feature_importances = (
    pd.DataFrame({'feature': top_features1, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)
feature_importances

Unnamed: 0,feature,importance
0,Estimated_Number_of_Houses_in_1_KM_Radius,0.466546
1,No_of_Other_ATMs_in_1_KM_radius,0.201806
2,Average_Wait_Time,0.0797
3,ATM_Zone_RM,0.060932
4,ATM_Attached_to_Petrol Bunk,0.034209
5,ATM_Zone_C,0.02984
6,ATM_TYPE_Urban,0.026322
7,ATM_Zone_FV,0.020826
8,ATM_Location_TYPE_Passbook Printing and Withdraw,0.018584
9,Day_Type_Working,0.012562


In [49]:
# Pair each classification feature with the decision tree's importance score
# and rank them from most to least important.
importances = model2.feature_importances_
importance_table = pd.DataFrame(dict(feature=top_features2, importance=importances))
feature_importances = (
    importance_table
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)
feature_importances

Unnamed: 0,feature,importance
0,Estimated_Number_of_Houses_in_1_KM_Radius,0.40605
1,No_of_Other_ATMs_in_1_KM_radius,0.224377
2,Number_of_Shops_Around_ATM,0.117402
3,Average_Wait_Time,0.070201
4,ATM_Attached_to_Petrol Bunk,0.032484
5,ATM_Zone_FV,0.026503
6,ATM_Zone_RL,0.019096
7,Day_Type_Working,0.016967
8,ATM_TYPE_Urban,0.014482
9,ATM_TYPE_Town,0.01203
