In [32]:
from lightgbm import LGBMClassifier

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline

from scipy.stats import chi2_contingency
from pipelines import preprocessor, preprocessor2
from preprocess import extra_features, separate

In [14]:
train=pd.read_csv("Datasets/train.csv")
train=extra_features(train)
separate(train)

<IPython.core.display.Javascript object>

Target Column: ['cost_category']

Numerical Columns: ['night_mainland', 'night_zanzibar', 'total_nights']

Categorical Columns: ['age_group', 'travel_with', 'tour_arrangement', 'package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz', 'package_sightseeing', 'package_guided_tour', 'package_insurance', 'first_trip_tz', 'trip_length', 'group_size']

Text Columns: ['country', 'purpose', 'main_activity', 'info_source']


In [15]:
X=train.drop('cost_category', axis=1)
y=train['cost_category']

In [16]:
col_drop=[]

for col in X.columns:
    
    ct=pd.crosstab(y, X[col])
    chi2, P, dof , _ = chi2_contingency(ct)
    
    if P**3 >= 0.05:
        col_drop.append(col)
    elif chi2 <= 1000:
            col_drop.append(col)
        
    print(f"Column: {col}\nChi_Test: {chi2}\nP_Value: {P}\nDegree: {dof}\n*************")

<IPython.core.display.Javascript object>

Column: country
Chi_Test: 9779.033678192025
P_Value: 0.0
Degree: 650
*************


<IPython.core.display.Javascript object>

Column: age_group
Chi_Test: 1803.196457648891
P_Value: 0.0
Degree: 20
*************


<IPython.core.display.Javascript object>

Column: travel_with
Chi_Test: 4358.8256594511395
P_Value: 0.0
Degree: 20
*************


<IPython.core.display.Javascript object>

Column: total_female
Chi_Test: 3730.647692424969
P_Value: 0.0
Degree: 115
*************


<IPython.core.display.Javascript object>

Column: total_male
Chi_Test: 1402.6656478001087
P_Value: 3.290821934232691e-220
Degree: 115
*************


<IPython.core.display.Javascript object>

Column: purpose
Chi_Test: 6975.312873617305
P_Value: 0.0
Degree: 35
*************


<IPython.core.display.Javascript object>

Column: main_activity
Chi_Test: 3606.50524881434
P_Value: 0.0
Degree: 40
*************


<IPython.core.display.Javascript object>

Column: info_source
Chi_Test: 4166.3943611855575
P_Value: 0.0
Degree: 35
*************


<IPython.core.display.Javascript object>

Column: tour_arrangement
Chi_Test: 8331.284428771029
P_Value: 0.0
Degree: 5
*************


<IPython.core.display.Javascript object>

Column: package_transport_int
Chi_Test: 5414.9841247089
P_Value: 0.0
Degree: 5
*************


<IPython.core.display.Javascript object>

Column: package_accomodation
Chi_Test: 7883.608233888307
P_Value: 0.0
Degree: 5
*************


<IPython.core.display.Javascript object>

Column: package_food
Chi_Test: 7146.769578021885
P_Value: 0.0
Degree: 5
*************


<IPython.core.display.Javascript object>

Column: package_transport_tz
Chi_Test: 6346.490489845604
P_Value: 0.0
Degree: 5
*************


<IPython.core.display.Javascript object>

Column: package_sightseeing
Chi_Test: 4501.574863783683
P_Value: 0.0
Degree: 5
*************


<IPython.core.display.Javascript object>

Column: package_guided_tour
Chi_Test: 4873.499054202309
P_Value: 0.0
Degree: 5
*************


<IPython.core.display.Javascript object>

Column: package_insurance
Chi_Test: 2067.872038967938
P_Value: 0.0
Degree: 5
*************


<IPython.core.display.Javascript object>

Column: night_mainland
Chi_Test: 5614.015301227679
P_Value: 0.0
Degree: 545
*************


<IPython.core.display.Javascript object>

Column: night_zanzibar
Chi_Test: 2652.1882265219597
P_Value: 0.0
Degree: 295
*************


<IPython.core.display.Javascript object>

Column: first_trip_tz
Chi_Test: 2229.180451768042
P_Value: 0.0
Degree: 5
*************


<IPython.core.display.Javascript object>

Column: total_nights
Chi_Test: 6523.200295117052
P_Value: 0.0
Degree: 580
*************


<IPython.core.display.Javascript object>

Column: trip_length
Chi_Test: 402.40609235137765
P_Value: 2.159271242364782e-76
Degree: 15
*************


<IPython.core.display.Javascript object>

Column: total_people
Chi_Test: 5188.487023293924
P_Value: 0.0
Degree: 155
*************


<IPython.core.display.Javascript object>

Column: group_size
Chi_Test: 686.1389145094461
P_Value: 1.6709403090257822e-136
Degree: 15
*************


In [17]:
col_drop

['trip_length', 'group_size']

In [18]:
X.drop(columns=col_drop, inplace=True)

In [19]:
X=preprocessor2(X)
X.shape

(18506, 35)

In [20]:
le=LabelEncoder()
y=le.fit_transform(y)

In [21]:
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.3, shuffle= True, random_state=42)

In [22]:
lgb=LGBMClassifier(n_estimators= 600,
                  n_jobs= 1,
                  learning_rate= 0.1,
                  objective= 'multiclass',
                  max_depth=8)

lgb.fit(X_train, y_train)

In [23]:
y_pred= lgb.predict(X_test)

In [24]:
print(classification_report(y_pred, y_test, target_names=le.classes_))

              precision    recall  f1-score   support

   High Cost       0.38      0.46      0.42       923
 Higher Cost       0.71      0.61      0.66      1676
Highest Cost       0.11      0.29      0.15        45
    Low Cost       0.15      0.26      0.19       259
  Lower Cost       0.59      0.56      0.58       774
 Normal Cost       0.63      0.56      0.59      1875

    accuracy                           0.54      5552
   macro avg       0.43      0.46      0.43      5552
weighted avg       0.58      0.54      0.56      5552



In [26]:
kfolds= StratifiedKFold(n_splits=10, shuffle=True)


for fold, (train_idx, val_idx) in enumerate(kfolds.split(X,y)):
    X_train, X_test=X[train_idx], X[val_idx]
    y_train, y_test=y[train_idx], y[val_idx]

lgb.fit(X_train, y_train)    

In [27]:
y_pred=lgb.predict(X_test)

In [29]:
print(classification_report(y_pred, y_test, target_names=le.classes_))

              precision    recall  f1-score   support

   High Cost       0.42      0.46      0.44       336
 Higher Cost       0.71      0.64      0.68       540
Highest Cost       0.08      0.17      0.11        18
    Low Cost       0.14      0.28      0.19        79
  Lower Cost       0.59      0.63      0.61       242
 Normal Cost       0.63      0.54      0.58       635

    accuracy                           0.55      1850
   macro avg       0.43      0.45      0.43      1850
weighted avg       0.58      0.55      0.56      1850



In [45]:
def strat(model):
    kfolds= StratifiedKFold(n_splits=10, shuffle=True)
    results= {"Fold_{}".format(fold): [] for fold in range(10)}
    
    for fold, (train_idx, val_idx) in enumerate(kfolds.split(X,y)):
        X_train, X_test=X[train_idx], X[val_idx]
        y_train, y_test=y[train_idx], y[val_idx]
    
        model.fit(X_train, y_train)
    
        y_pred=model.predict(X_test)
        
        
        F1_Score=f1_score(y_pred, y_test, average='weighted')
        results["Fold_{}".format(fold)].append(F1_Score)
        
    results['mean']=pd.DataFrame(results).mean(axis=1)    
    df= pd.DataFrame(results)
        
    return df

In [46]:
strat(lgb)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,Fold_0,Fold_1,Fold_2,Fold_3,Fold_4,Fold_5,Fold_6,Fold_7,Fold_8,Fold_9,mean
0,0.584084,0.568771,0.576134,0.536034,0.572813,0.551322,0.566061,0.566555,0.579566,0.580848,0.568219
