In [1]:
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is 
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
import pandas as pd

# Load the x_* CSV files (presumably the feature tables) for the train and test partitions.
x_train, x_test = (
    pd.read_csv(csv_path) for csv_path in ('x_train_all.csv', 'x_test_all.csv')
)
# Load the y_* CSV files (presumably the matching labels) in the same train/test order.
y_train, y_test = (
    pd.read_csv(csv_path) for csv_path in ('y_train_all.csv', 'y_test_all.csv')
)

In [5]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Rank every feature by its chi-squared score against the label and keep the 50 strongest.
# The training frames are already in memory (x_train / y_train), so reuse them
# instead of parsing both CSV files a second time.
data = x_train
datatype = y_train
X1 = data.iloc[:, 0:2304]   # independent columns: the first 2304 columns of the feature table
y1 = datatype.iloc[:, -1]   # target column: the last column of the label table
# k only limits what transform() would keep; fit() still stores one score per
# input column in scores_, which is all this cell uses.
bestfeatures = SelectKBest(score_func=chi2, k=2)
fit = bestfeatures.fit(X1, y1)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X1.columns)
# Put column labels and scores side by side: one row per feature.
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  # 'Specs' = feature column label, 'Score' = chi-squared statistic
# The 50 highest-scoring features, displayed as the cell output.
featureScoresAll50 = featureScores.nlargest(50, 'Score')
featureScoresAll50

Unnamed: 0,Specs,Score
1263,1263,150633.369191
1264,1264,148051.674998
1216,1216,144958.677289
1311,1311,144287.318409
1215,1215,140869.000438
1168,1168,136870.402181
1312,1312,134552.189285
1310,1310,131485.851171
1121,1121,129990.460474
1262,1262,129791.235186


In [152]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

# "30%" split: 30% of the rows from the training CSVs are held out, the other 70% are used for fitting.
# NOTE(review): this rebinds x_test / y_test, so the frames read from the
# *_test_all.csv files are no longer reachable under those names — confirm this is intended.
x_train_30, x_test, y_train_30, y_test = train_test_split(
    x_train, y_train, random_state=42, test_size=0.3
)

# Depth-limited tree; changing max_depth also changes the accuracy scores.
tree_clf = DecisionTreeClassifier(random_state=42, max_depth=3)
tree_clf.fit(x_train_30, y_train_30)

In [107]:
# Predict with the current tree on the 30%-split training rows, then on whatever
# x_test currently refers to (the 30% hold-out when cells are run in order).
y_pred_train_30, y_pred_test_30 = (
    tree_clf.predict(features) for features in (x_train_30, x_test)
)

In [104]:
from sklearn.metrics import accuracy_score

# Share of 30%-split training rows whose predicted label equals the true label.
train_accuracy_30 = accuracy_score(y_true=y_train_30, y_pred=y_pred_train_30)
print("Accuracy on the training set:", train_accuracy_30)

Accuracy on the training set: 0.9929234851835471


In [105]:
# Share of held-out rows predicted correctly; y_test must still be the 30% hold-out
# (i.e. the 60% split cell has not been run yet) for the lengths to match.
test_accuracy_30 = accuracy_score(y_true=y_test, y_pred=y_pred_test_30)
print("Accuracy on the testing set:", test_accuracy_30)

Accuracy on the testing set: 0.8211214310285517


In [153]:
# "60%" split: 60% of the rows from the training CSVs are held out, leaving 40% for fitting.
# NOTE(review): x_test / y_test are rebound here, so from this cell onwards they
# refer to the 60% hold-out rather than the 30% one.
x_train_60, x_test, y_train_60, y_test = train_test_split(
    x_train, y_train, random_state=42, test_size=0.6
)
# Refit the existing estimator object on the smaller training portion.
tree_clf.fit(x_train_60, y_train_60)

In [154]:
# Predict with the current tree on the 60%-split training rows, then on the
# rows currently bound to x_test (the 60% hold-out when cells are run in order).
y_pred_train_60, y_pred_test_60 = (
    tree_clf.predict(features) for features in (x_train_60, x_test)
)

In [155]:
# Share of 60%-split training rows whose predicted label equals the true label.
train_accuracy_60 = accuracy_score(y_true=y_train_60, y_pred=y_pred_train_60)
print("Accuracy on the training set:", train_accuracy_60)

Accuracy on the training set: 0.478844169246646


In [156]:
# Share of 60% hold-out rows predicted correctly (y_test is the 60% hold-out at this point).
test_accuracy_60 = accuracy_score(y_true=y_test, y_pred=y_pred_test_60)
print("Accuracy on the testing set:", test_accuracy_60)

Accuracy on the testing set: 0.45614035087719296


In [49]:

# Fit a fresh depth-3 tree on the 30% split and recompute its training accuracy
# from this fit, so the printed figure always describes the model trained in
# this cell rather than a value left over from an earlier cell.
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_clf.fit(x_train_30, y_train_30)
train_accuracy_30 = accuracy_score(y_train_30, tree_clf.predict(x_train_30))
print(train_accuracy_30)


0.4449358690844759


In [50]:
# Cross-validation, 30% split
from sklearn.model_selection import cross_val_score

# A fresh, unfitted depth-3 tree serves as the template estimator for every fold.
tree_clf = DecisionTreeClassifier(random_state=42, max_depth=3)
# cv sets the number of folds; adjust as needed.
cross_score_30 = cross_val_score(estimator=tree_clf, X=x_train_30, y=y_train_30, cv=5)
print("Cross-validated accuracy scores:", cross_score_30)

Cross-validated accuracy scores: [0.44509948 0.42520265 0.43773029 0.45648968 0.4159292 ]


In [51]:
# Cross-validation, 60% split: same estimator settings and fold count,
# applied to the smaller training portion.
cross_score_60 = cross_val_score(estimator=tree_clf, X=x_train_60, y=y_train_60, cv=5)
print("Cross-validated accuracy scores:", cross_score_60)

Cross-validated accuracy scores: [0.4742268  0.46193548 0.47096774 0.47612903 0.44903226]


In [86]:
#Precision Score:
#30%
from sklearn.metrics import precision_score

# y_test has been rebound by the 60% split, so it no longer lines up with
# y_pred_test_30. Re-create the 30% hold-out labels instead: the same row count,
# test_size and random_state select exactly the same rows as the original split.
y_test_30 = train_test_split(y_train, test_size=0.3, random_state=42)[1]

# Weighted average: per-class precision weighted by each class's support.
precision_score_train_30 = precision_score(y_train_30, y_pred_train_30,average="weighted")
print("Precision Score for Training: ",precision_score_train_30)
precision_score_test_30 = precision_score(y_test_30, y_pred_test_30,average="weighted")
print("Precision Score for Testing: ",precision_score_test_30)

Precision Score for Training:  0.508438639711504
Precision Score for Testing:  0.48930856306366155


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [87]:
# Precision, 60% split (support-weighted average over classes).
precision_score_train_60 = precision_score(
    y_true=y_train_60, y_pred=y_pred_train_60, average="weighted"
)
print("Precision Score for Training: ",precision_score_train_60)
# y_test is the 60% hold-out at this point.
precision_score_test_60 = precision_score(
    y_true=y_test, y_pred=y_pred_test_60, average="weighted"
)
print("Precision Score for Testing: ",precision_score_test_60)

Precision Score for Training:  0.5015591144995052
Precision Score for Testing:  0.48737984849671323


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [82]:
#Recall Score:
#30%
from sklearn.metrics import recall_score

# y_test has been rebound by the 60% split, so it no longer lines up with
# y_pred_test_30. Re-create the 30% hold-out labels instead: the same row count,
# test_size and random_state select exactly the same rows as the original split.
y_test_30 = train_test_split(y_train, test_size=0.3, random_state=42)[1]

# Weighted average: per-class recall weighted by each class's support.
recall_score_train_30 = recall_score(y_train_30, y_pred_train_30,average="weighted")
print("Recall Score for Training: ", recall_score_train_30)
recall_score_test_30 = recall_score(y_test_30, y_pred_test_30,average="weighted")
print("Recall Score for Testing: ", recall_score_test_30)

Recall Score for Training:  0.4449358690844759
Recall Score for Testing:  0.4444444444444444


In [83]:
# Recall, 60% split (support-weighted average over classes).
recall_score_train_60 = recall_score(
    y_true=y_train_60, y_pred=y_pred_train_60, average="weighted"
)
print("Recall Score for Training: ", recall_score_train_60)
# y_test is the 60% hold-out at this point.
recall_score_test_60 = recall_score(
    y_true=y_test, y_pred=y_pred_test_60, average="weighted"
)
print("Recall Score for Testing: ",recall_score_test_60)

Recall Score for Training:  0.478844169246646
Recall Score for Training:  0.45614035087719296


In [80]:
#F1 Score:
#30%
from sklearn.metrics import f1_score

# y_test has been rebound by the 60% split, so it no longer lines up with
# y_pred_test_30. Re-create the 30% hold-out labels instead: the same row count,
# test_size and random_state select exactly the same rows as the original split.
y_test_30 = train_test_split(y_train, test_size=0.3, random_state=42)[1]

# Weighted average: per-class F1 weighted by each class's support.
f1_score_train_30 = f1_score(y_train_30, y_pred_train_30, average='weighted')
print("F1 Score for Training: ",f1_score_train_30)
f1_score_test_30 = f1_score(y_test_30, y_pred_test_30, average='weighted')
print("F1 Score for Testing: ",f1_score_test_30)

F1 Score for Training:  0.40194808174946645
F1 Score for Testing:  0.3991434872415531


In [81]:
# F1, 60% split (support-weighted average over classes).
f1_score_train_60 = f1_score(
    y_true=y_train_60, y_pred=y_pred_train_60, average='weighted'
)
print("F1 Score for Training: ",f1_score_train_60)
# y_test is the 60% hold-out at this point.
f1_score_test_60 = f1_score(
    y_true=y_test, y_pred=y_pred_test_60, average='weighted'
)
print("F1 Score for Testing: ",f1_score_test_60)

F1 Score for Training:  0.45080868746710656
F1 Score for Testing:  0.43076101364505004
