In [7]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.svm import SVC 

from scipy.stats import chi2_contingency
from pipelines import preprocessor, preprocessor2
from preprocess import extra_features, separate

In [2]:
train=pd.read_csv('Datasets/train.csv')
train=extra_features(train)

<IPython.core.display.Javascript object>

In [3]:
separate(train)

Target Column: ['cost_category']

Numerical Columns: ['night_mainland', 'night_zanzibar', 'total_nights']

Categorical Columns: ['age_group', 'travel_with', 'tour_arrangement', 'package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz', 'package_sightseeing', 'package_guided_tour', 'package_insurance', 'first_trip_tz', 'trip_length', 'group_size']

Text Columns: ['country', 'purpose', 'main_activity', 'info_source']


In [4]:
X=train.drop('cost_category', axis=1)
y=train['cost_category']

In [10]:
col_drop=[]

for col in X.columns:
    
    ct=pd.crosstab(y, X[col])
    chi2, P, _ , _ = chi2_contingency(ct)
    
    if P >= 0.05:
        col_drop.append(col)
    elif chi2 <= 1000:
            col_drop.append(col)
     
    print(f"Column: {col}\nChi_Test: {chi2}\nP_Value: {P}\n---------------")

<IPython.core.display.Javascript object>

Column: country
Chi_Test: 9779.033678192025
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: age_group
Chi_Test: 1803.196457648891
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: travel_with
Chi_Test: 4358.8256594511395
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: total_female
Chi_Test: 3730.647692424969
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: total_male
Chi_Test: 1402.6656478001087
P_Value: 3.290821934232691e-220
---------------


<IPython.core.display.Javascript object>

Column: purpose
Chi_Test: 6975.312873617305
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: main_activity
Chi_Test: 3606.50524881434
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: info_source
Chi_Test: 4166.3943611855575
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: tour_arrangement
Chi_Test: 8331.284428771029
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: package_transport_int
Chi_Test: 5414.9841247089
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: package_accomodation
Chi_Test: 7883.608233888307
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: package_food
Chi_Test: 7146.769578021885
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: package_transport_tz
Chi_Test: 6346.490489845604
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: package_sightseeing
Chi_Test: 4501.574863783683
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: package_guided_tour
Chi_Test: 4873.499054202309
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: package_insurance
Chi_Test: 2067.872038967938
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: night_mainland
Chi_Test: 5614.015301227679
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: night_zanzibar
Chi_Test: 2652.1882265219597
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: first_trip_tz
Chi_Test: 2229.180451768042
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: total_nights
Chi_Test: 6523.200295117052
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: trip_length
Chi_Test: 402.40609235137765
P_Value: 2.159271242364782e-76
---------------


<IPython.core.display.Javascript object>

Column: total_people
Chi_Test: 5188.487023293924
P_Value: 0.0
---------------


<IPython.core.display.Javascript object>

Column: group_size
Chi_Test: 686.1389145094461
P_Value: 1.6709403090257822e-136
---------------


In [12]:
X.drop(columns=col_drop, inplace=True)

In [13]:
X=preprocessor(X)

In [14]:
X.shape

(18506, 57)

In [15]:
le=LabelEncoder()
y=le.fit_transform(y)

In [16]:
X_train,X_test,y_train,y_test=train_test_split(X, y, shuffle=True, random_state=1)

In [17]:
cl=SVC(gamma="auto", kernel='rbf')
cl.fit(X_train, y_train)

In [19]:
y_pred=cl.predict(X_test)

In [21]:
print(classification_report(y_test, y_pred, zero_division=True, target_names=le.classes_))

              precision    recall  f1-score   support

   High Cost       0.51      0.33      0.40       962
 Higher Cost       0.59      0.80      0.68      1197
Highest Cost       1.00      0.00      0.00        95
    Low Cost       0.00      0.00      0.00       386
  Lower Cost       0.55      0.61      0.58       618
 Normal Cost       0.53      0.65      0.58      1369

    accuracy                           0.55      4627
   macro avg       0.53      0.40      0.37      4627
weighted avg       0.51      0.55      0.51      4627



In [22]:
folds= StratifiedKFold(n_splits=10, shuffle=True, random_state=42 )

for fold, (train_idx, val_idx) in enumerate (folds.split(X,y)):
    X_train, X_test = X[train_idx], X[val_idx]
    y_train, y_test = y[train_idx], y[val_idx]

In [24]:
cl.fit(X_train, y_train)

y_pred=cl.predict(X_test)

print(classification_report(y_test, y_pred,zero_division=True, target_names= le.classes_))

              precision    recall  f1-score   support

   High Cost       0.46      0.35      0.40       367
 Higher Cost       0.61      0.77      0.68       487
Highest Cost       1.00      0.00      0.00        36
    Low Cost       1.00      0.00      0.00       156
  Lower Cost       0.58      0.65      0.61       257
 Normal Cost       0.54      0.66      0.59       547

    accuracy                           0.56      1850
   macro avg       0.70      0.40      0.38      1850
weighted avg       0.60      0.56      0.52      1850



In [26]:
scores=cross_val_score(cl, X_train, y_train, scoring='accuracy', cv=10)
scores.mean()

<IPython.core.display.Javascript object>

0.5481497463850405

In [27]:
clf=SVC()

C_values = [0.1, 1, 10, 100, 1000]
for C in C_values:
    clf.C = C
    clf.fit(X_train, y_train)
    print("C = {}: accuracy = {}".format(C, clf.score(X_val, y_val)))

# Tune the kernel parameter
kernels = [ "linear", "rbf"]
for kernel in kernels:
    clf.kernel = kernel
    clf.fit(X_train, y_train)
    print("Kernel = {}: accuracy = {}".format(kernel, clf.score(X_val, y_val)))

# Tune the decision_function_shape parameter
decision_function_shapes = ["ovo", "ovr"]
for decision_function_shape in decision_function_shapes:
    clf.decision_function_shape = decision_function_shape
    clf.fit(X_train, y_train)
    print("Decision_function_shape = {}: accuracy = {}".format(decision_function_shape, clf.score(X_val, y_test)))

# Tune the class_weight parameter
class_weights = {0: 1, 1: 2}
clf.class_weight = class_weights
clf.fit(X_train, y_train)
print("Class_weight = {}: accuracy = {}".format(class_weights, clf.score(X_val, y_val)))

NameError: name 'X_val' is not defined