In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the training set into a DataFrame.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
tabular_data = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Dupe\Downloads\tabular-playground-series\train.csv',
)

In [3]:
# Preview the first five rows of the training frame.
tabular_data.head(n=5)

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


In [4]:
# Count the missing values in each column of the training frame
# (the boolean null mask is summed column by column).
tabular_data.isnull().sum(axis=0)

row_id      0
date        0
country     0
store       0
product     0
num_sold    0
dtype: int64

In [5]:
# Number of rows for each distinct value of the 'country' column.
tabular_data.loc[:, 'country'].value_counts()

Finland    8766
Norway     8766
Sweden     8766
Name: country, dtype: int64

In [6]:
# Number of rows for each distinct value of the 'store' column.
tabular_data.loc[:, 'store'].value_counts()

KaggleMart    13149
KaggleRama    13149
Name: store, dtype: int64

In [7]:
# Number of rows for each distinct value of the 'product' column.
tabular_data.loc[:, 'product'].value_counts()

Kaggle Hat        8766
Kaggle Mug        8766
Kaggle Sticker    8766
Name: product, dtype: int64

In [8]:
# Encode the string-valued 'country', 'store' and 'product' columns as integer codes.
# The codes are plain labels chosen by hand; the columns end up as int64, not as
# pandas categoricals.
category_codes = {
    'country': {'Finland': 0, 'Norway': 1, 'Sweden': 2},
    'store': {'KaggleMart': 0, 'KaggleRama': 1},
    'product': {'Kaggle Sticker': 0, 'Kaggle Hat': 1, 'Kaggle Mug': 2},
}
# Rebind to a new frame rather than mutating in place, and pin the dtype explicitly:
# pandas has deprecated the implicit object -> int downcast after `replace`, so
# relying on it can leave these columns with object dtype. The cast also fails
# loudly if a value without a code is still present.
tabular_data = (
    tabular_data
    .replace(category_codes)
    .astype({column: 'int64' for column in category_codes})
)

In [9]:
# Preview the first five rows again to inspect the current column values.
tabular_data.head(n=5)

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,0,0,2,329
1,1,2015-01-01,0,0,1,520
2,2,2015-01-01,0,0,0,146
3,3,2015-01-01,0,1,2,572
4,4,2015-01-01,0,1,1,911


In [10]:
# Features: everything except the row identifier, the date and the target column.
X = tabular_data.drop(columns=['row_id', 'date', 'num_sold'])
# Target: the 'num_sold' column.
Y = tabular_data.loc[:, 'num_sold']

In [11]:
# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=2,
    test_size=0.2,
)

In [12]:
# Algorithms
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import GaussianNB

In [13]:
# Stochastic Gradient Descent (SGD)
# NOTE(review): Y_train is the 'num_sold' column, so every distinct sales count is
# treated as its own class -- confirm that a classifier (not a regressor) is intended.
# random_state is fixed (same seed value as the train/test split) so that the fitted
# model and its score are reproducible between runs.
sgd = linear_model.SGDClassifier(max_iter=5, tol=None, random_state=2)
sgd.fit(X_train, Y_train)

# Predictions for the held-out rows (not scored in this cell).
Y_pred = sgd.predict(X_test)

# Accuracy on the training rows, as a percentage rounded to two decimals.
acc_sgd = round(sgd.score(X_train, Y_train) * 100, 2)

In [14]:
# Random Forest Classifier
# NOTE(review): Y_train is the 'num_sold' column, so every distinct sales count is
# treated as its own class -- confirm that a classifier (not a regressor) is intended.
# random_state is fixed (same seed value as the train/test split) so that the forest,
# and any predictions made with it, are reproducible between runs.
random_forest = RandomForestClassifier(n_estimators=100, random_state=2)
random_forest.fit(X_train, Y_train)

# Predictions for the held-out rows (not scored in this cell).
Y_prediction = random_forest.predict(X_test)

# Accuracy on the training rows, as a percentage rounded to two decimals.
acc_random_forest = round(random_forest.score(X_train, Y_train) * 100, 2)

In [15]:
# Logistic regression
# The default iteration budget was exhausted before the solver converged, so the
# limit is raised. NOTE(review): if the convergence warning persists, scale the
# features or raise max_iter further.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, Y_train)

# Predictions for the held-out rows (not scored in this cell).
Y_pred = logreg.predict(X_test)

# Accuracy on the training rows, as a percentage rounded to two decimals.
acc_log = round(logreg.score(X_train, Y_train) * 100, 2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [23]:
# K Nearest Neighbour
# Fit a 3-neighbour classifier on the training rows (`fit` returns the estimator).
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, Y_train)
# Predictions for the held-out rows (not scored in this cell).
Y_pred = knn.predict(X_test)
# Accuracy on the training rows, as a percentage rounded to two decimals.
acc_knn = round(100 * knn.score(X_train, Y_train), 2)

In [24]:
# Gaussian Naive Bayes
# Fit on the training rows (`fit` returns the estimator).
gaussian = GaussianNB().fit(X_train, Y_train)
# Predictions for the held-out rows (not scored in this cell).
Y_pred = gaussian.predict(X_test)
# Accuracy on the training rows, as a percentage rounded to two decimals.
acc_gaussian = round(100 * gaussian.score(X_train, Y_train), 2)

In [21]:
# Perceptron
# Fit with at most 10 passes over the training rows (`fit` returns the estimator).
perceptron = Perceptron(max_iter=10).fit(X_train, Y_train)
# Predictions for the held-out rows (not scored in this cell).
Y_pred = perceptron.predict(X_test)
# Accuracy on the training rows, as a percentage rounded to two decimals.
acc_perceptron = round(100 * perceptron.score(X_train, Y_train), 2)



In [25]:
# Linear Support Vector Machine
# random_state is fixed (same seed value as the train/test split) so the fit is
# repeatable. NOTE(review): the seed only matters when the solver shuffles the
# data (dual formulation); it is harmless otherwise.
linear_svc = LinearSVC(random_state=2)
linear_svc.fit(X_train, Y_train)

# Predictions for the held-out rows (not scored in this cell).
Y_pred = linear_svc.predict(X_test)

# Accuracy on the training rows, as a percentage rounded to two decimals.
acc_linear_svc = round(linear_svc.score(X_train, Y_train) * 100, 2)

In [27]:
# Decision tree
# random_state is fixed (same seed value as the train/test split): the tree
# builder permutes features randomly at each split, so an unseeded tree can
# differ between runs.
decision_tree = DecisionTreeClassifier(random_state=2)
decision_tree.fit(X_train, Y_train)
# Predictions for the held-out rows (not scored in this cell).
Y_pred = decision_tree.predict(X_test)
# Accuracy on the training rows, as a percentage rounded to two decimals.
acc_decision_tree = round(decision_tree.score(X_train, Y_train) * 100, 2)

In [28]:
#which is the best model?
# Each label is paired with its score so the two cannot drift out of alignment.
# Every acc_* value is the model's accuracy on the training rows, in percent.
# NOTE(review): 'Decent' looks like a typo for 'Descent'; the label is kept verbatim.
model_scores = [
    ('Support Vector Machines', acc_linear_svc),
    ('KNN', acc_knn),
    ('Logistic Regression', acc_log),
    ('Random Forest', acc_random_forest),
    ('Naive Bayes', acc_gaussian),
    ('Perceptron', acc_perceptron),
    ('Stochastic Gradient Decent', acc_sgd),
    ('Decision Tree', acc_decision_tree),
]
results = pd.DataFrame(model_scores, columns=['Model', 'Score'])

# Highest score first, with the score used as the row index.
result_df = results.sort_values(by='Score', ascending=False).set_index('Score')
result_df.head(9)

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
1.55,Random Forest
1.55,Decision Tree
0.81,Support Vector Machines
0.81,Logistic Regression
0.71,KNN
0.45,Naive Bayes
0.37,Stochastic Gradient Decent
0.13,Perceptron


In [29]:
#Using the test data
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
test_data = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Dupe\Downloads\tabular-playground-series\test.csv',
)
# Preview the first five rows.
test_data.head(n=5)

Unnamed: 0,row_id,date,country,store,product
0,26298,2019-01-01,Finland,KaggleMart,Kaggle Mug
1,26299,2019-01-01,Finland,KaggleMart,Kaggle Hat
2,26300,2019-01-01,Finland,KaggleMart,Kaggle Sticker
3,26301,2019-01-01,Finland,KaggleRama,Kaggle Mug
4,26302,2019-01-01,Finland,KaggleRama,Kaggle Hat


In [30]:
# Count the missing values in each column of the test frame
# (the boolean null mask is summed column by column).
test_data.isnull().sum(axis=0)

row_id     0
date       0
country    0
store      0
product    0
dtype: int64

In [31]:
# Encode the string-valued 'country', 'store' and 'product' columns of the test
# frame as integer codes. The codes are plain labels chosen by hand; the columns
# end up as int64, not as pandas categoricals.
category_codes = {
    'country': {'Finland': 0, 'Norway': 1, 'Sweden': 2},
    'store': {'KaggleMart': 0, 'KaggleRama': 1},
    'product': {'Kaggle Sticker': 0, 'Kaggle Hat': 1, 'Kaggle Mug': 2},
}
# Rebind to a new frame rather than mutating in place, and pin the dtype explicitly:
# pandas has deprecated the implicit object -> int downcast after `replace`, so
# relying on it can leave these columns with object dtype. The cast also fails
# loudly if a value without a code is still present.
test_data = (
    test_data
    .replace(category_codes)
    .astype({column: 'int64' for column in category_codes})
)

In [32]:
# Preview the first five rows of the test frame to inspect the current column values.
test_data.head(n=5)

Unnamed: 0,row_id,date,country,store,product
0,26298,2019-01-01,0,0,2
1,26299,2019-01-01,0,0,1
2,26300,2019-01-01,0,0,0
3,26301,2019-01-01,0,1,2
4,26302,2019-01-01,0,1,1


In [33]:
# Features of the test frame: everything except the row identifier and the date.
# NOTE: this rebinds X, which held the training features before this cell.
X = test_data.drop(columns=['row_id', 'date'])

In [34]:
# Preview the first five rows of the feature frame.
X.head(n=5)

Unnamed: 0,country,store,product
0,0,0,2
1,0,0,1
2,0,0,0
3,0,1,2
4,0,1,1


In [38]:
# predicting on actual test data
# The fitted random forest predicts one value for every row of X.
y_result = random_forest.predict(X=X)
# Show predictions at positions 1 through 9.
# NOTE(review): the slice starts at 1, so the first prediction is not shown --
# confirm whether [:10] was intended.
y_result[1:10]

array([288, 103, 307, 607, 147, 307, 546, 142, 492], dtype=int64)

In [39]:
#create a new column 'num_sold' to save predictions
test_data['num_sold'] = y_result

# Keep only the two columns that are exported. Selecting by label raises a
# KeyError if either column is missing instead of silently producing an
# all-NaN column.
prediction = test_data[['row_id', 'num_sold']]

# Write the file without the row index. `to_csv` returns None when it is given
# a path, so its result is not stored.
prediction.to_csv(r'C:\Users\Dupe\Downloads\tabular_playground.csv', index=False, header=True)
print('Successful!')

Successful!
