In [558]:
import pandas as pd
import numpy as np

In [559]:
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test_features.csv')

In [560]:
train.head()

Unnamed: 0,ID,Text,Category
0,969,@JuliaBradbury @SimonCalder @walsop @HodderPRI...,0
1,241,or here https://t.co/R2tO79Easn … .An in house...,1
2,820,@britshmuseum @thehistoryguy Gosh periscope is...,2
3,693,@Ophiolatrist britishmuseum The stupid #French...,1
4,421,@SassyClde We won't stop til @britishmuseum du...,1


In [561]:
test.head()

Unnamed: 0,ID,Text
0,1861,Goodbye @kettlesyard see you in .25 years! htt...
1,354,"@BBC_Culture @PlymouthMuseum Oh dear, why not ..."
2,1334,Fantastic @johnmcdonnellMP standing up for wor...
3,906,"@BBC_Culture @PlymouthMuseum Oh dear, why not ..."
4,1290,@britishmuseum @TripAdvisor it is !


In [562]:
text = train['Text']

In [563]:
text.head()

0    @JuliaBradbury @SimonCalder @walsop @HodderPRI...
1    or here https://t.co/R2tO79Easn … .An in house...
2    @britshmuseum @thehistoryguy Gosh periscope is...
3    @Ophiolatrist britishmuseum The stupid #French...
4    @SassyClde We won't stop til @britishmuseum du...
Name: Text, dtype: object

In [564]:
text.shape

(1600,)

## Word Tokenization and Removing StopWords

In [565]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [566]:
text[0]

'@JuliaBradbury @SimonCalder @walsop @HodderPRIBA @_TheWhitechapel A pleasure to meet you all last week!'

In [567]:
stop_words = stopwords.words('english')
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [568]:
## Pre-processing the text message

message = []
for i in range(len(text)):
    s = word_tokenize(text[i])
    temp = []  
    for j in s:
        if len(j) > 1 and j not in stop_words:  
            temp.append(j)
    t = ' '.join(temp)  
    message.append(t)  

print(message)

['JuliaBradbury SimonCalder walsop HodderPRIBA _TheWhitechapel pleasure meet last week', "https //t.co/R2tO79Easn .An house solutio proposed meet flexibility needs Come NationalGallery 's time TALK", 'britshmuseum thehistoryguy Gosh periscope definitely trend High profile people using even tried yet', 'Ophiolatrist britishmuseum The stupid French prick crushed Russian Imperial ancestors Waterloo200', "SassyClde We wo n't stop til britishmuseum dumps BP Fill feedback form 're amp watch space http //t.co/o2JlUnGi1y", "tateliverpool Member 's Preview JacksonPollock BlindSpots yesterday TateLiverpool FANTASTIC exhibition congratulations", 'britishuseum How British double line formation squad fire work surely entire battalion one long line deep', 'James Richards extraordinary instln _TheWhitechapel brilliantly obliquely piece critical interpretation Sonic art theory Lush', '300 miles see art In case might worth tateliverpool http //t.co/4Eqch03Mb5 http //t.co/18Nokgmvv5', 'NationalGallery f

In [569]:
for i in range(len(train)):
    train.loc[i,'Text'] = message[i]

In [570]:
train.head(4)

Unnamed: 0,ID,Text,Category
0,969,JuliaBradbury SimonCalder walsop HodderPRIBA _...,0
1,241,https //t.co/R2tO79Easn .An house solutio prop...,1
2,820,britshmuseum thehistoryguy Gosh periscope defi...,2
3,693,Ophiolatrist britishmuseum The stupid French p...,1


In [571]:
train.shape

(1600, 3)

In [572]:
text = train['Text']

In [573]:
text

0       JuliaBradbury SimonCalder walsop HodderPRIBA _...
1       https //t.co/R2tO79Easn .An house solutio prop...
2       britshmuseum thehistoryguy Gosh periscope defi...
3       Ophiolatrist britishmuseum The stupid French p...
4       SassyClde We wo n't stop til britishmuseum dum...
                              ...                        
1595                tateliverpool Great stuff 'll ASAP :0
1596    View Estaque Cézanne atrisk exported FitzMuseu...
1597    Agreed rhiannonakelly Eat sandwiches britishmu...
1598    BAGcurators NationalGallery Yes It 's hard bel...
1599    _TheWhitechapel `` Toward Parliament square du...
Name: Text, Length: 1600, dtype: object

## Doing Lemmitization

In [574]:
import re
from nltk.stem import WordNetLemmatizer

In [575]:
lemma  = WordNetLemmatizer()

In [576]:
corpus = []
for i in range(len(text)):
    temp = re.sub('[^a-zA-Z]',' ',text[i])
    temp = temp.lower()
    temp = temp.split()
    temp = [lemma.lemmatize(p) for p in temp if p not in set(stopwords.words('english'))]
    temp = ' '.join(temp)
    corpus.append(temp)

In [577]:
for i in range(len(train)):
    train.loc[i,'Text'] = corpus[i]

In [578]:
train.head(4)

Unnamed: 0,ID,Text,Category
0,969,juliabradbury simoncalder walsop hodderpriba t...,0
1,241,http co r easn house solutio proposed meet fle...,1
2,820,britshmuseum thehistoryguy gosh periscope defi...,2
3,693,ophiolatrist britishmuseum stupid french prick...,1


In [579]:
train.shape

(1600, 3)

In [580]:
test.shape

(400, 2)

## Bag of words (bow)

In [486]:
## To convert from text to vector

In [487]:
from sklearn.feature_extraction.text import CountVectorizer

In [488]:
cv = CountVectorizer(max_features=500)
bow_x = cv.fit_transform(train['Text']).toarray()

In [489]:
bow_x

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [490]:
bow_x.shape

(1600, 500)

In [491]:
# I have 1600 sentence and each contain 1000 unique words

In [492]:
train_df = pd.DataFrame(bow_x)

In [493]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [494]:
train_df['target'] = train['Category']

In [495]:
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,2
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


## Train Test Split

In [496]:
from sklearn.model_selection import train_test_split

In [497]:
x = train_df.iloc[:,:-1]

In [498]:
x.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [499]:
y = train_df.iloc[:,-1]

In [500]:
y.head()

0    0
1    1
2    2
3    1
4    1
Name: target, dtype: int64

In [501]:
x_train , x_test , y_train , y_test = train_test_split(x , y , 
                                                       test_size=0.3 ,random_state= 108)

In [502]:
print(x_train.shape)
print(x_test.shape)

print(y_train.shape)
print(y_test.shape)

(1120, 500)
(480, 500)
(1120,)
(480,)


In [503]:
## Budiling Model
# pipeline package to merge NLP and Machine Learning together and get the output
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

In [504]:
xgb_classifier = Pipeline([
                       ('Classifier',XGBClassifier())
                      ])

In [505]:
xgb_classifier.fit(x_train,y_train)

In [506]:
y_pred_train = xgb_classifier.predict(x_train)
y_pred_test = xgb_classifier.predict(x_test)

In [507]:
# Evaluation Matrix
from sklearn.metrics import confusion_matrix  , classification_report , accuracy_score

In [508]:
print(confusion_matrix(y_train,y_pred_train))
print("***********"*10)
print(confusion_matrix(y_test,y_pred_test))

[[277   0   1   0]
 [  3 285   0   0]
 [  6   0 286   0]
 [  0   0   0 262]]
**************************************************************************************************************
[[102   2   6   6]
 [  4 117   0   0]
 [  7   0  93   1]
 [  6   1   1 134]]


In [509]:
print(classification_report(y_train,y_pred_train))
print("***********"*10)
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       278
           1       1.00      0.99      0.99       288
           2       1.00      0.98      0.99       292
           3       1.00      1.00      1.00       262

    accuracy                           0.99      1120
   macro avg       0.99      0.99      0.99      1120
weighted avg       0.99      0.99      0.99      1120

**************************************************************************************************************
              precision    recall  f1-score   support

           0       0.86      0.88      0.87       116
           1       0.97      0.97      0.97       121
           2       0.93      0.92      0.93       101
           3       0.95      0.94      0.95       142

    accuracy                           0.93       480
   macro avg       0.93      0.93      0.93       480
weighted avg       0.93      0.93      0.93       480



In [510]:
print(accuracy_score(y_train,y_pred_train))
print("***********"*10)
print(accuracy_score(y_test,y_pred_test))

0.9910714285714286
**************************************************************************************************************
0.9291666666666667


 With BOW I am getting 93 percent accuracy for test data.lets try this on given test submission data

In [511]:
test.head()

Unnamed: 0,ID,Text
0,1861,Goodbye @kettlesyard see you in .25 years! htt...
1,354,"@BBC_Culture @PlymouthMuseum Oh dear, why not ..."
2,1334,Fantastic @johnmcdonnellMP standing up for wor...
3,906,"@BBC_Culture @PlymouthMuseum Oh dear, why not ..."
4,1290,@britishmuseum @TripAdvisor it is !


In [512]:
text2 = test['Text']

In [513]:
## Pre-processing the text message

message = []
for i in range(len(text2)):
    s = word_tokenize(text2[i])
    temp = []  
    for j in s:
        if len(j) > 1 and j not in stop_words:  
            temp.append(j)
    t = ' '.join(temp)  
    message.append(t)  

print(message)

['Goodbye kettlesyard see .25 years http //t.co/WFjBAiWbfM', "BBC_Culture PlymouthMuseum Oh dear Dame Laura Knight Surely 'The Beach lst", 'Fantastic johnmcdonnellMP standing workers NationalGallery noprivatisatio edvaizey sits fence http //t.co/QzBYpUAtQt', "BBC_Culture PlymouthMuseum Oh dear Dame Laura Knight Surely'The Beach list", 'britishmuseum TripAdvisor', 'britishmuseum thehistoryguy shame idiots post rude cmments throughout', 'Lots ancient artefacts enjoy britishmuseum amazingplace toomuchforaquickvisit http //t.co/utnDOEiGXO', 'Thanks great dance tutors amp joined taster sessions amp flashmob practice ICIABath today', "It 's one thosedays ... NationalGallery sorefeet http //t.co/5cwgqIoGbZ", 'It would great see maerial approached using queer theory foxvertebrae britishmuseum', 'blaircurator alex_neilson Tate_StIves Hang buildinghas renovated great expense ...', 'Now Parliament whistles banner leaflets confiscated manaers NationalGallery time noprivatisation reinstatecandy', '

In [514]:
for i in range(len(test)):
    test.loc[i,'Text'] = message[i]

In [515]:
text2 = test['Text']

In [516]:
corpus = []
for i in range(len(text2)):
    temp = re.sub('[^a-zA-Z]',' ',text2[i])
    temp = temp.lower()
    temp = temp.split()
    temp = [lemma.lemmatize(p) for p in temp if p not in set(stopwords.words('english'))]
    temp = ' '.join(temp)
    corpus.append(temp)

In [517]:
for i in range(len(test)):
    test.loc[i,'Text'] = corpus[i]

In [518]:
test.head()

Unnamed: 0,ID,Text
0,1861,goodbye kettlesyard see year http co wfjbaiwbfm
1,354,bbc culture plymouthmuseum oh dear dame laura ...
2,1334,fantastic johnmcdonnellmp standing worker nati...
3,906,bbc culture plymouthmuseum oh dear dame laura ...
4,1290,britishmuseum tripadvisor


In [519]:
test.shape

(400, 2)

In [520]:
bow_test = cv.fit_transform(test['Text']).toarray()

In [521]:
bow_test

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [522]:
bow_test.shape

(400, 500)

In [523]:
test.head()

Unnamed: 0,ID,Text
0,1861,goodbye kettlesyard see year http co wfjbaiwbfm
1,354,bbc culture plymouthmuseum oh dear dame laura ...
2,1334,fantastic johnmcdonnellmp standing worker nati...
3,906,bbc culture plymouthmuseum oh dear dame laura ...
4,1290,britishmuseum tripadvisor


In [524]:
y_pred_test_submission = xgb_classifier.predict(bow_test)

In [525]:
y_pred_test_submission

array([3, 3, 2, 3, 2, 2, 2, 1, 2, 2, 0, 1, 0, 1, 2, 2, 2, 3, 3, 3, 0, 1,
       3, 2, 1, 2, 0, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 3, 1, 2, 0, 2,
       2, 1, 0, 0, 3, 0, 3, 0, 1, 0, 1, 1, 1, 2, 0, 0, 3, 3, 0, 0, 0, 2,
       2, 2, 3, 2, 0, 1, 1, 0, 2, 1, 2, 1, 3, 0, 2, 2, 2, 2, 0, 2, 2, 2,
       3, 1, 3, 1, 2, 0, 1, 3, 2, 0, 2, 0, 3, 3, 2, 2, 2, 2, 3, 1, 2, 2,
       0, 2, 1, 3, 0, 2, 2, 1, 3, 2, 3, 2, 0, 2, 2, 3, 2, 2, 2, 3, 1, 2,
       0, 3, 1, 3, 2, 2, 0, 0, 2, 3, 2, 1, 2, 2, 3, 2, 2, 1, 3, 0, 1, 2,
       2, 3, 2, 2, 2, 0, 1, 2, 2, 3, 2, 2, 1, 1, 0, 2, 2, 3, 1, 2, 1, 2,
       1, 0, 2, 0, 1, 0, 3, 3, 0, 1, 0, 2, 2, 2, 3, 3, 2, 0, 1, 1, 0, 2,
       2, 1, 2, 2, 1, 0, 2, 2, 3, 0, 3, 3, 1, 2, 2, 3, 2, 3, 0, 2, 2, 0,
       3, 3, 3, 1, 0, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 2, 3, 2, 0, 2, 3, 1,
       2, 3, 2, 2, 0, 2, 1, 2, 2, 0, 0, 2, 2, 2, 3, 0, 1, 0, 2, 3, 2, 1,
       1, 3, 3, 2, 1, 2, 0, 2, 1, 0, 1, 2, 2, 2, 0, 2, 2, 3, 2, 3, 0, 2,
       2, 1, 3, 1, 2, 0, 2, 0, 2, 0, 2, 0, 3, 3, 2,

In [526]:
y_pred_test_submission.shape

(400,)

In [527]:
test.head()

Unnamed: 0,ID,Text
0,1861,goodbye kettlesyard see year http co wfjbaiwbfm
1,354,bbc culture plymouthmuseum oh dear dame laura ...
2,1334,fantastic johnmcdonnellmp standing worker nati...
3,906,bbc culture plymouthmuseum oh dear dame laura ...
4,1290,britishmuseum tripadvisor


In [528]:
test.shape

(400, 2)

In [529]:
submission = pd.read_csv('dataset/sample_submission.csv')

In [530]:
submission.head()

Unnamed: 0,ID,Prediction
0,1861,0
1,354,0
2,1334,0
3,906,0
4,1290,0


In [531]:
submission.shape

(400, 2)

In [532]:
for i in range(len(submission)):
    submission.loc[i , 'Prediction'] = y_pred_test_submission[i]

In [533]:
submission.head()

Unnamed: 0,ID,Prediction
0,1861,3
1,354,3
2,1334,2
3,906,3
4,1290,2


In [534]:
## Saving File

In [536]:
submission.to_csv('submission.csv', index=False)  # 'index=False' prevents saving the index column


In [548]:
## TF-IDF

In [549]:
train.head()

Unnamed: 0,ID,Text,Category
0,969,juliabradbury simoncalder walsop hodderpriba t...,0
1,241,http co r easn house solutio proposed meet fle...,1
2,820,britshmuseum thehistoryguy gosh periscope defi...,2
3,693,ophiolatrist britishmuseum stupid french prick...,1
4,421,sassyclde wo n stop til britishmuseum dump bp ...,1


In [551]:
train['Category'].value_counts()/len(train)*100

Category
1    25.5625
3    25.2500
0    24.6250
2    24.5625
Name: count, dtype: float64

## TF-IDF & Random Forest

In [581]:
train.head()

Unnamed: 0,ID,Text,Category
0,969,juliabradbury simoncalder walsop hodderpriba t...,0
1,241,http co r easn house solutio proposed meet fle...,1
2,820,britshmuseum thehistoryguy gosh periscope defi...,2
3,693,ophiolatrist britishmuseum stupid french prick...,1
4,421,sassyclde wo n stop til britishmuseum dump bp ...,1


In [582]:
test.head()

Unnamed: 0,ID,Text
0,1861,Goodbye @kettlesyard see you in .25 years! htt...
1,354,"@BBC_Culture @PlymouthMuseum Oh dear, why not ..."
2,1334,Fantastic @johnmcdonnellMP standing up for wor...
3,906,"@BBC_Culture @PlymouthMuseum Oh dear, why not ..."
4,1290,@britishmuseum @TripAdvisor it is !


In [583]:
x_train , x_test , y_train , y_test = train_test_split(train['Text'] , train['Category'] , 
                                                       test_size=0.3 ,random_state=101)

In [584]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC


In [585]:
tfidf = Pipeline([
                       ('TF-IDF', TfidfVectorizer()),
                       ('Classifier',RandomForestClassifier())
                      ])

In [586]:
tfidf.fit(x_train,y_train)

In [588]:
# Predict the model by using train and test data

y_pred_train = tfidf.predict(x_train)
y_pred_test = tfidf.predict(x_test)

In [589]:
print(confusion_matrix(y_train,y_pred_train))
print("***********"*10)
print(confusion_matrix(y_test,y_pred_test))

[[285   0   0   0]
 [  0 290   0   0]
 [  0   0 275   0]
 [  0   0   0 270]]
**************************************************************************************************************
[[ 93   2  13   1]
 [  0 115   4   0]
 [  6   0 112   0]
 [  3   0   0 131]]


In [590]:
print(classification_report(y_train,y_pred_train))
print("***********"*10)
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       285
           1       1.00      1.00      1.00       290
           2       1.00      1.00      1.00       275
           3       1.00      1.00      1.00       270

    accuracy                           1.00      1120
   macro avg       1.00      1.00      1.00      1120
weighted avg       1.00      1.00      1.00      1120

**************************************************************************************************************
              precision    recall  f1-score   support

           0       0.91      0.85      0.88       109
           1       0.98      0.97      0.97       119
           2       0.87      0.95      0.91       118
           3       0.99      0.98      0.98       134

    accuracy                           0.94       480
   macro avg       0.94      0.94      0.94       480
weighted avg       0.94      0.94      0.94       480



In [591]:
print(accuracy_score(y_train,y_pred_train))
print("***********"*10)
print(accuracy_score(y_test,y_pred_test))

1.0
**************************************************************************************************************
0.9395833333333333


In [592]:
y_pred_test_submission = tfidf.predict(test['Text'])

In [593]:
submission = pd.read_csv('dataset/sample_submission.csv')

In [594]:
for i in range(len(submission)):
    submission.loc[i , 'Prediction'] = y_pred_test_submission[i]

In [595]:
submission.to_csv('submission.csv', index=False)  # 'index=False' prevents saving the index column


## TF-IDF & SVM

In [614]:
svm = Pipeline([
                       ('TF-IDF', TfidfVectorizer()),
                       ('Classifier',SVC())
                      ])

In [615]:
svm.fit(x_train,y_train)

In [616]:
# Predict the model by using train and test data

y_pred_train = svm.predict(x_train)
y_pred_test = svm.predict(x_test)

In [617]:
print(classification_report(y_train,y_pred_train))
print("***********"*10)
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       285
           1       1.00      1.00      1.00       290
           2       1.00      1.00      1.00       275
           3       1.00      1.00      1.00       270

    accuracy                           1.00      1120
   macro avg       1.00      1.00      1.00      1120
weighted avg       1.00      1.00      1.00      1120

**************************************************************************************************************
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       109
           1       0.99      0.97      0.98       119
           2       1.00      0.99      1.00       118
           3       1.00      1.00      1.00       134

    accuracy                           0.99       480
   macro avg       0.99      0.99      0.99       480
weighted avg       0.99      0.99      0.99       480



In [618]:
print(accuracy_score(y_train,y_pred_train))
print("***********"*10)
print(accuracy_score(y_test,y_pred_test))

1.0
**************************************************************************************************************
0.9875


In [619]:
y_pred_test_submission = svm.predict(test['Text'])

In [620]:
for i in range(len(submission)):
    submission.loc[i , 'Prediction'] = y_pred_test_submission[i]

In [621]:
submission.to_csv('submission.csv', index=False)  # 'index=False' prevents saving the index column


In [606]:
classifier = Pipeline([
                       ('TF-IDF', TfidfVectorizer()),
                       ('Classifier',XGBClassifier())
                      ])

In [607]:
classifier.fit(x_train,y_train)

In [608]:
# Predict the model by using train and test data

y_pred_train = classifier.predict(x_train)
y_pred_test = classifier.predict(x_test)

In [609]:
print(classification_report(y_train,y_pred_train))
print("***********"*10)
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       285
           1       1.00      1.00      1.00       290
           2       0.99      1.00      0.99       275
           3       1.00      1.00      1.00       270

    accuracy                           1.00      1120
   macro avg       1.00      1.00      1.00      1120
weighted avg       1.00      1.00      1.00      1120

**************************************************************************************************************
              precision    recall  f1-score   support

           0       0.81      0.75      0.78       109
           1       0.97      0.94      0.96       119
           2       0.83      0.93      0.88       118
           3       0.94      0.92      0.93       134

    accuracy                           0.89       480
   macro avg       0.89      0.89      0.89       480
weighted avg       0.89      0.89      0.89       480



In [610]:
print(accuracy_score(y_train,y_pred_train))
print("***********"*10)
print(accuracy_score(y_test,y_pred_test))

0.9973214285714286
**************************************************************************************************************
0.8895833333333333


In [611]:
y_pred_test_submission = classifier.predict(test['Text'])

In [612]:
for i in range(len(submission)):
    submission.loc[i , 'Prediction'] = y_pred_test_submission[i]

In [613]:
submission.to_csv('submission.csv', index=False)  # 'index=False' prevents saving the index column
