In [1]:
import pandas as pd   # package for data analysis
import numpy as np    # package for numerical computations

# libraries for visualization
import matplotlib.pyplot as plt
import seaborn as sns

# to ignore warnings
import warnings
warnings.filterwarnings('ignore')

# For Preprocessing, ML models and Evaluation
from sklearn.model_selection import train_test_split   # To split the dataset into train and test set

from sklearn.linear_model import LogisticRegression     # Logistic regression model

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder    # for converting categorical to numerical

from sklearn.metrics import f1_score    # for model evaluation

In [46]:
# Load the labelled training set; each row pairs a text STORY with an integer SECTION label
data = pd.read_excel('Data_Train.xlsx')
# Preview the first five rows
data.head()

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3


In [47]:
# (rows, columns) of the training frame
data.shape

(7628, 2)

In [48]:
# Summary statistics for the numeric columns (only SECTION here; its values run from 0 to 3)
data.describe()

Unnamed: 0,SECTION
count,7628.0
mean,1.357892
std,0.999341
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,3.0


In [49]:
# Count missing values per column to confirm that no rows need imputing or dropping
data.isnull().sum()

STORY      0
SECTION    0
dtype: int64

In [50]:
# Separate the raw text (features) from the section label (target)
X, y = data['STORY'], data['SECTION']

In [51]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; the same instance is reused
# further down to encode the submission text
cv = CountVectorizer()

In [52]:
# Build the bag-of-words matrix straight from the raw text column instead of
# from `X`, so this cell can be re-run safely: once `X` has been rebound to the
# sparse matrix, calling fit_transform(X) again fails because the vectorizer
# expects an iterable of strings.
# NOTE(review): the vocabulary is learned from every row before the train/test
# split in the next cell, so held-out stories contribute to the feature space;
# consider fitting on the training rows only — TODO confirm impact.
X = cv.fit_transform(data['STORY'])

In [53]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=18,
)
# Show the shape of each piece as a sanity check
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((6102, 32810), (6102,), (1526, 32810), (1526,))

In [77]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier with smoothing parameter alpha=0.5
nb = MultinomialNB(alpha=0.5)

In [78]:
# Train the Naive Bayes model on the training split
nb.fit(X_train,y_train)

MultinomialNB(alpha=0.5)

In [79]:
# Accuracy of the Naive Bayes model on the held-out split
nb.score(X_test, y_test)

0.9764089121887287

In [80]:
# Predicted section labels for the held-out split; used by the reports in the next cell
y_pred = nb.predict(X_test)
y_pred

array([1, 2, 1, ..., 0, 1, 0], dtype=int64)

In [82]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-class breakdown of the Naive Bayes predictions: the confusion matrix
# (rows = true label, columns = predicted label), a blank gap, then the
# precision / recall / F1 table.
print(
    confusion_matrix(y_test, y_pred),
    '\n',
    classification_report(y_test, y_pred),
    sep='\n',
)

[[329  10   2   4]
 [  6 517   2   6]
 [  2   4 385   0]
 [  0   0   0 259]]


              precision    recall  f1-score   support

           0       0.98      0.95      0.96       345
           1       0.97      0.97      0.97       531
           2       0.99      0.98      0.99       391
           3       0.96      1.00      0.98       259

    accuracy                           0.98      1526
   macro avg       0.98      0.98      0.98      1526
weighted avg       0.98      0.98      0.98      1526



In [83]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier

In [85]:
# SVM classification with a sigmoid kernel, fitted in the same statement
# (fit() returns the estimator itself).
# NOTE(review): the recorded accuracy (about 0.30) is below the share of the
# largest class in the held-out split (531 of 1526, about 0.35), so this
# kernel/gamma setting looks unsuited to raw count features — TODO revisit.
svc = SVC(kernel='sigmoid', gamma=1.0).fit(X_train, y_train)
svc.score(X_test, y_test)

0.2981651376146789

In [86]:
# Store the SVM's predictions for the held-out split (not referenced by any other visible cell)
y_svc = svc.predict(X_test)

In [91]:
# Single decision tree; a node needs at least 9 samples before it may be split.
# Construction and fitting are chained because fit() returns the estimator.
dtc = DecisionTreeClassifier(
    min_samples_split=9,
    random_state=252,
).fit(X_train, y_train)
y_dtc = dtc.predict(X_test)

In [92]:
# Accuracy of the decision tree on the held-out split
dtc.score(X_test, y_test)

0.8571428571428571

In [95]:
# Extra-trees ensemble of 900 trees with a fixed seed, built and fitted in one statement
etc = ExtraTreesClassifier(
    n_estimators=900,
    random_state=252,
).fit(X_train, y_train)
y_etc = etc.predict(X_test)
# Accuracy on the held-out split
etc.score(X_test, y_test)

0.9574049803407602

In [97]:
# Random forest of 37 trees with a fixed seed, built and fitted in one statement
rfc = RandomForestClassifier(
    n_estimators=37,
    random_state=252,
).fit(X_train, y_train)
# Accuracy on the held-out split
rfc.score(X_test, y_test)

0.9305373525557011

In [98]:
# Store the random forest's predictions for the held-out split (not referenced by any other visible cell)
y_rfc = rfc.predict(X_test)

In [99]:
# AdaBoost ensemble with 37 boosting rounds and a fixed seed, built and fitted in one statement
abc = AdaBoostClassifier(
    n_estimators=37,
    random_state=252,
).fit(X_train, y_train)
# Accuracy on the held-out split
abc.score(X_test, y_test)

0.836173001310616

In [101]:
# Bagging ensemble of 9 estimators with a fixed seed, built and fitted in one statement
bc = BaggingClassifier(
    n_estimators=9,
    random_state=252,
).fit(X_train, y_train)
# Accuracy on the held-out split
bc.score(X_test, y_test)

0.8958060288335518

In [73]:
# Load the submission set; the preview shows a STORY column and no SECTION label
test_data = pd.read_excel('Data_Test.xlsx')
# Preview the first five rows
test_data.head()

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...


In [74]:
# Encode the submission stories with the vocabulary already learned by `cv`
# (transform only — the vectorizer is not refitted).
# NOTE(review): this rebinds `X`, which previously held the training feature
# matrix; re-running the earlier train/test split cell after this one would
# pair these rows with the training labels.
X = cv.transform(test_data['STORY'])

In [75]:
# Predict a section for every submission story with the fitted Naive Bayes model
test_predict = nb.predict(X)
test_predict

array([1, 2, 1, ..., 1, 0, 1], dtype=int64)

In [76]:
# Wrap the predictions in a single-column frame named SECTION and write the
# submission file without the index column
y_test_pred = pd.DataFrame({'SECTION': test_predict})
y_test_pred.to_csv('submission.csv', index=False)