In [23]:
# @Author Euclidi Filippo - matr. 294517

# This file contains the final model tuned as best as possible
# The model chosen after comparing all the results is the Logistic Regression

In [24]:
# Import all the libraries used in this notebook.

import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

# Features Selection methods imports
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
  
# Data preprocessing methods imports
from sklearn import preprocessing 

# Train test split imports
from sklearn.model_selection import train_test_split

# Score evaluating methods imports
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Classifications Models imports
from sklearn.linear_model import LogisticRegression

# Model Selection and Cross Validation imports
from sklearn.model_selection import KFold


In [25]:
# Helper that computes the accuracy, the F1 score and the confusion matrix of the predicted values

def score(y_test, y_pred, specification):
    """Evaluate predicted labels against the true labels.

    Parameters
    ----------
    y_test
        True labels.
    y_pred
        Predicted labels, compared against ``y_test``.
    specification
        Value forwarded as the ``average`` argument of ``f1_score``.

    Returns
    -------
    tuple
        ``(accuracy, f1, confusion_matrix)``, in that order.
    """
    return (
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average=specification),
        confusion_matrix(y_test, y_pred),
    )

In [26]:
# Let's re-create the function that we will use to determine the best number of features
def findBestFeatures(model, X_train, y_train, max_k=20):
    """Plot the training accuracy of ``model`` against the number of selected features.

    For every k from 1 up to ``max_k`` (capped at the number of columns in
    ``X_train``), the k best features are selected with ``SelectKBest`` using
    the chi-squared score function, ``model`` is fitted on the reduced data and
    its accuracy on that same training data is recorded. The accuracies are
    then drawn as a line plot.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``score(X, y)``. It is re-fitted
        for every k, so it is left fitted on the last feature subset.
    X_train
        Two-dimensional feature matrix (anything ``np.shape`` can inspect).
    y_train
        Target values aligned with the rows of ``X_train``.
    max_k : int, default 20
        Largest number of features to try. The default reproduces the
        previously hard-coded range 1..20.

    Returns
    -------
    None
        The function only shows a plot.

    Notes
    -----
    NOTE(review): ``chi2`` is documented as requiring non-negative feature
    values, so standardized data would presumably be rejected - confirm the
    input is unscaled (or min-max scaled) before calling this helper.
    """
    # Never ask for more features than the matrix actually has.
    n_features = np.shape(X_train)[1]
    k = np.arange(1, min(max_k, n_features) + 1, 1)

    accuracy_list_train = []
    for each in k:
        selector = SelectKBest(score_func=chi2, k=int(each))
        x_new = selector.fit_transform(X_train, y_train)
        model.fit(x_new, y_train)
        # Accuracy is measured on the data the model was just fitted on.
        accuracy_list_train.append(model.score(x_new, y_train))

    plt.plot(k, accuracy_list_train, color="green", label="train")
    plt.xlabel("k values")
    plt.ylabel("train accuracy")
    plt.legend()
    plt.show()

In [27]:
# Let's also use k-fold cross-validation, as this dataset is not very big
 
def kFoldCrossValidation(model, X, y, k=10):
    """Run k-fold cross validation for ``model`` and print per-fold and average scores.

    The rows of ``X`` are split with ``KFold`` (no shuffling is requested). For
    each fold the model is fitted on the training part, evaluated on the
    held-out part with ``score`` (weighted F1), and the fold's accuracy, F1
    score and confusion matrix are printed. After the last fold the averages
    over all folds are printed.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is re-fitted on
        every fold, so it is left fitted on the last fold's training part.
    X
        Feature matrix supporting ``X[index_array, :]`` indexing (e.g. a NumPy
        array).
    y
        Target values supporting ``y[index_array]`` indexing.
    k : int, default 10
        Number of folds. The default reproduces the previously hard-coded 10.

    Returns
    -------
    None
        Results are only printed.
    """
    kf = KFold(n_splits=k, random_state=None)

    # Per-fold results. The names deliberately avoid shadowing the imported
    # sklearn ``f1_score`` function.
    fold_accuracies = []
    fold_f1_scores = []
    fold_conf_matrices = []

    for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
        X_train, X_test = X[train_index, :], X[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        pred_values = model.predict(X_test)
        acc, f1, conf_m = score(y_test, pred_values, 'weighted')

        fold_accuracies.append(acc)
        fold_f1_scores.append(f1)
        fold_conf_matrices.append(conf_m)
        print("Round number " + str(i) + ", Accuracy score: " + str(acc) + ", F1 Score:" + str(f1)
              + ",\nConfusion Matrix:\n" + str(conf_m) + "\n")

    avg_acc_score = sum(fold_accuracies) / k
    avg_f1_score = sum(fold_f1_scores) / k
    # Element-wise mean of the per-fold confusion matrices.
    avg_conf_mat = sum(fold_conf_matrices) / k

    print("Average results after " + str(k) + " folds cross validation: ")
    print("Accuracy score: " + str(avg_acc_score) + " \nF1 Score:" + str(avg_f1_score)
          + " \nAverage Confusion Matrix:\n" + str(avg_conf_mat) + "\n")

In [28]:
# Load the training and test datasets from their CSV files
df_train, df_test = (
    pd.read_csv("./archive/train.csv"),
    pd.read_csv("./archive/test.csv"),
)

In [29]:
# Build the feature table X from the training data by removing the
# price_range column, which is the target value used to check how good a
# classifier is. The column is dropped by name, so the result does not depend
# on where price_range sits in the file.
#
# X is still a DataFrame at this point; it becomes a NumPy array once the
# scaler is applied to it later in the notebook.
X = df_train.drop(columns=['price_range'])

# The test data has an 'id' column that is removed before it is used as features.
X_t = df_test.drop(columns=['id'])

# Visualizing the dataset
print("Training set:\n" + str(X.head()))
print("Test set:\n" + str(X_t.head()))
print(X)

Training set:
   battery_power  blue  clock_speed  dual_sim  fc  four_g  int_memory  m_dep  \
0            842     0          2.2         0   1       0           7    0.6   
1           1021     1          0.5         1   0       1          53    0.7   
2            563     1          0.5         1   2       1          41    0.9   
3            615     1          2.5         0   0       0          10    0.8   
4           1821     1          1.2         0  13       1          44    0.6   

   mobile_wt  n_cores  pc  px_height  px_width   ram  sc_h  sc_w  talk_time  \
0        188        2   2         20       756  2549     9     7         19   
1        136        3   6        905      1988  2631    17     3          7   
2        145        5   6       1263      1716  2603    11     2          9   
3        131        6   9       1216      1786  2769    16     8         11   
4        141        2  14       1208      1212  1411     8     2         15   

   three_g  touch_screen  wifi

In [30]:
# Target vector: the price_range column of the training data, as an array
y = df_train['price_range'].to_numpy()

In [31]:
# Preprocessing: fit a StandardScaler on the training features, then apply the
# same fitted scaler to both the training features and the test features.
# NOTE(review): the scaler is fitted on all of X before any train/test split
# or cross validation is done - confirm this is intended.
std_scaler = preprocessing.StandardScaler()
std_scaler.fit(X)

# Both transforms are evaluated before either name is rebound.
X, X_t = std_scaler.transform(X), std_scaler.transform(X_t)

print(X)  # Show the scaled training features

[[-0.90259726 -0.9900495   0.83077942 ... -1.78686097 -1.00601811
   0.98609664]
 [-0.49513857  1.0100505  -1.2530642  ...  0.55964063  0.99401789
  -1.01409939]
 [-1.5376865   1.0100505  -1.2530642  ...  0.55964063  0.99401789
  -1.01409939]
 ...
 [ 1.53077336 -0.9900495  -0.76274805 ...  0.55964063  0.99401789
  -1.01409939]
 [ 0.62252745 -0.9900495  -0.76274805 ...  0.55964063  0.99401789
   0.98609664]
 [-1.65833069  1.0100505   0.58562134 ...  0.55964063  0.99401789
   0.98609664]]


In [32]:
# Hold out 10% of the labelled data as a test set, stratified on the target.
# A fixed random_state keeps the split, and every score computed from it,
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.10, stratify=y, random_state=42
)

In [33]:
# Train and evaluate the Logistic Regression classifier

print("Logistic Regression Classifier\n")

# Cross validation over the whole labelled set; every round is reported
logreg = LogisticRegression()
kFoldCrossValidation(logreg, X, y)

# Re-fit on the training split only and evaluate on the held-out split
pred = logreg.fit(X_train, y_train).predict(X_test)
acc, f1, c_mat = score(y_test, pred, 'weighted')
print("Final results using the full training and test set:\n", "Accuracy score: ", str(acc),
      ", F1 Score:", str(f1), ",\nConfusion Matrix:\n", str(c_mat), sep="")

# Predict the price range for the scaled test features and store the
# predictions as a new price_range column on df_test
final_prediction = logreg.predict(X_t)
df_test['price_range'] = final_prediction

df_test.head(10)

Logistic Regression Classifier

Round number 1, Accuracy score: 0.955, F1 Score:0.9548983099044298,
Confusion Matrix:
[[42  0  0  0]
 [ 2 45  2  0]
 [ 0  1 46  2]
 [ 0  0  2 58]]

Round number 2, Accuracy score: 0.98, F1 Score:0.9799914205685143,
Confusion Matrix:
[[46  1  0  0]
 [ 0 49  0  0]
 [ 0  0 48  2]
 [ 0  0  1 53]]

Round number 3, Accuracy score: 0.955, F1 Score:0.9548433251433253,
Confusion Matrix:
[[48  1  0  0]
 [ 2 42  1  0]
 [ 0  2 47  2]
 [ 0  0  1 54]]

Round number 4, Accuracy score: 0.95, F1 Score:0.950002225189141,
Confusion Matrix:
[[50  2  0  0]
 [ 3 42  1  0]
 [ 0  2 51  1]
 [ 0  0  1 47]]

Round number 5, Accuracy score: 0.97, F1 Score:0.9700098829141888,
Confusion Matrix:
[[51  1  0  0]
 [ 0 45  0  0]
 [ 0  2 41  1]
 [ 0  0  2 57]]

Round number 6, Accuracy score: 0.96, F1 Score:0.9600766178266178,
Confusion Matrix:
[[52  1  0  0]
 [ 0 47  2  0]
 [ 0  1 55  3]
 [ 0  0  1 38]]

Round number 7, Accuracy score: 0.96, F1 Score:0.9597574303707738,
Confusion Matrix:


Unnamed: 0,id,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,1,1043,1,1.8,1,14,0,5,0.1,193,...,226,1412,3476,12,7,2,0,1,0,3
1,2,841,1,0.5,1,4,1,61,0.8,191,...,746,857,3895,6,0,7,1,0,0,3
2,3,1807,1,2.8,0,1,0,27,0.9,186,...,1270,1366,2396,17,10,10,0,1,1,2
3,4,1546,0,0.5,1,18,1,25,0.5,96,...,295,1752,3893,10,0,7,1,1,0,3
4,5,1434,0,1.4,0,11,1,49,0.5,108,...,749,810,1773,15,8,7,1,0,1,1
5,6,1464,1,2.9,1,5,1,50,0.8,198,...,569,939,3506,10,7,3,1,1,1,3
6,7,1718,0,2.4,0,1,0,47,1.0,156,...,1283,1374,3873,14,2,10,0,0,0,3
7,8,833,0,2.4,1,0,0,62,0.8,111,...,1312,1880,1495,7,2,18,0,1,1,1
8,9,1111,1,2.9,1,9,1,25,0.6,101,...,556,876,3485,11,9,10,1,1,0,3
9,10,1520,0,0.5,0,1,0,25,0.5,171,...,52,1009,651,6,0,5,1,0,1,0
