In [1]:
import pandas as pd
import numpy as np
import spacy

In [3]:
data = pd.read_excel('C:/Users/HP/Desktop/Projects_data/Text Classification/Twitter/text_classification_dataset.xlsx')
data.head()

Unnamed: 0,text,type
0,@ACNI2012 @TheToka920 Never knew having 1 or 2...,sports
1,"MYCA Magical Moments:\n\nSeptember, 2011: Sham...",sports
2,The current state of last year's @BBL finalist...,sports
3,@HOLLYJISOO Why did you bring a cricket...,sports
4,Babar Azam only Pakistani included in the ICC ...,sports


In [4]:
data.text[0]

'@ACNI2012 @TheToka920 Never knew having 1 or 2 followers had anything to do with reality...Malinga has never been s… https://t.co/SSmLS18O4k'

removing '@' and the URL

In [5]:
data['text'] = data['text'].str.replace(r'@|https\S+','',regex=True)
data['text'][1]

'MYCA Magical Moments:\n\nSeptember, 2011: Sham Chotoo of the Bowie Boys and Girls Club joins Maryland Youth Cricket a… '

In [6]:
data['clean_text'] = data['text'].str.replace(r'[^a-zA-Z0-9\s]','',regex=True)         #removed special characters,symbols, punctuations etc.
data['clean_text'] = data['clean_text'].str.replace(r'\n|\s+',' ',regex=True)          # replaced new line or more than one spaces with 1 space
data['clean_text']

0       ACNI2012 TheToka920 Never knew having 1 or 2 f...
1       MYCA Magical Moments  September 2011 Sham Chot...
2       The current state of last years BBL finalists ...
3                  HOLLYJISOO Why did you bring a cricket
4       Babar Azam only Pakistani included in the ICC ...
                              ...                        
1157    The senior is one of the most decorated male t...
1158    2020 COULD be your year to get moving and chan...
1159    RT MailSport I thought you liked yellow on me ...
1160    RT BBCSport Tennis greats played together to r...
1161    RT MattRacquet A thread on hard court sliding ...
Name: clean_text, Length: 1162, dtype: object

In [7]:
nlp = spacy.load('en_core_web_sm')
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [8]:
doc = nlp("The current state of last years BBL finalists StarsBBL P10 W9 L1 RenegadesBBL P10 W1 L9 Cricket BBL09")
for token in doc:
    print(token)

The
current
state
of
last
years
BBL
finalists
StarsBBL
P10
W9
L1
RenegadesBBL
P10
W1
L9
Cricket
BBL09


In [9]:
data

Unnamed: 0,text,type,clean_text
0,ACNI2012 TheToka920 Never knew having 1 or 2 f...,sports,ACNI2012 TheToka920 Never knew having 1 or 2 f...
1,"MYCA Magical Moments:\n\nSeptember, 2011: Sham...",sports,MYCA Magical Moments September 2011 Sham Chot...
2,The current state of last year's BBL finalists...,sports,The current state of last years BBL finalists ...
3,HOLLYJISOO Why did you bring a cricket...,sports,HOLLYJISOO Why did you bring a cricket
4,Babar Azam only Pakistani included in the ICC ...,sports,Babar Azam only Pakistani included in the ICC ...
...,...,...,...
1157,The senior is one of the most decorated male t...,sports,The senior is one of the most decorated male t...
1158,2020 COULD be your year to get moving and chan...,sports,2020 COULD be your year to get moving and chan...
1159,RT MailSport: 'I thought you liked yellow on m...,sports,RT MailSport I thought you liked yellow on me ...
1160,RT BBCSport: ❤️ \n\nTennis greats played toget...,sports,RT BBCSport Tennis greats played together to r...


In [10]:
from spacy.lang.en.stop_words import STOP_WORDS
def preprocess(text):
    doc = nlp(text)
    tokens = [token.text for token in doc if token.text.lower() not in STOP_WORDS]
    lemmas = [token.lemma_ for token in nlp(' '.join(tokens))]
    final_text = ' '.join(lemmas)
    return final_text
data['clean_text'] = data['clean_text'].apply(preprocess)
data['clean_text']

0       acni2012 thetoka920 know have 1 2 follower rea...
1       MYCA Magical Moments    September 2011 Sham Ch...
2       current state year BBL finalist StarsBBL P10 W...
3                                HOLLYJISOO bring cricket
4       Babar Azam Pakistani include ICC ODI team year...
                              ...                        
1157    senior decorate male tennis player northwest L...
1158    2020 year move change live animal Midshore Com...
1159    RT MailSport think like yellow s ok    Grigor ...
1160    RT BBCSport Tennis great play raise money aust...
1161    RT MattRacquet thread hard court slide amp mov...
Name: clean_text, Length: 1162, dtype: object

In [11]:
data

Unnamed: 0,text,type,clean_text
0,ACNI2012 TheToka920 Never knew having 1 or 2 f...,sports,acni2012 thetoka920 know have 1 2 follower rea...
1,"MYCA Magical Moments:\n\nSeptember, 2011: Sham...",sports,MYCA Magical Moments September 2011 Sham Ch...
2,The current state of last year's BBL finalists...,sports,current state year BBL finalist StarsBBL P10 W...
3,HOLLYJISOO Why did you bring a cricket...,sports,HOLLYJISOO bring cricket
4,Babar Azam only Pakistani included in the ICC ...,sports,Babar Azam Pakistani include ICC ODI team year...
...,...,...,...
1157,The senior is one of the most decorated male t...,sports,senior decorate male tennis player northwest L...
1158,2020 COULD be your year to get moving and chan...,sports,2020 year move change live animal Midshore Com...
1159,RT MailSport: 'I thought you liked yellow on m...,sports,RT MailSport think like yellow s ok Grigor ...
1160,RT BBCSport: ❤️ \n\nTennis greats played toget...,sports,RT BBCSport Tennis great play raise money aust...


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

X_train,X_val,y_train,y_val = train_test_split(data['clean_text'],data['type'],test_size=0.2,random_state=1)

vectorizer = TfidfVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_val_vect = vectorizer.transform(X_val)


In [13]:
model = MultinomialNB()
model.fit(X_train_vect,y_train)
y_pred = model.predict(X_val_vect)
y_pred

array(['politics', 'entertainment', 'medical', 'politics', 'medical',
       'entertainment', 'medical', 'politics', 'entertainment',
       'politics', 'sports', 'politics', 'politics', 'politics',
       'entertainment', 'politics', 'medical', 'politics',
       'entertainment', 'politics', 'politics', 'medical', 'politics',
       'medical', 'politics', 'medical', 'sports', 'politics', 'politics',
       'sports', 'sports', 'politics', 'politics', 'entertainment',
       'entertainment', 'medical', 'politics', 'sports', 'entertainment',
       'politics', 'politics', 'politics', 'medical', 'entertainment',
       'politics', 'politics', 'sports', 'politics', 'politics', 'sports',
       'politics', 'politics', 'sports', 'politics', 'politics',
       'entertainment', 'medical', 'politics', 'entertainment', 'medical',
       'sports', 'politics', 'politics', 'politics', 'medical',
       'politics', 'medical', 'entertainment', 'entertainment', 'sports',
       'sports', 'politics', '

In [14]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_val))

               precision    recall  f1-score   support

entertainment       0.77      0.82      0.80        45
      medical       0.67      0.88      0.76        49
     politics       0.88      0.64      0.74        96
       sports       0.79      0.95      0.86        43

     accuracy                           0.78       233
    macro avg       0.78      0.82      0.79       233
 weighted avg       0.80      0.78      0.78       233



In [15]:
data['type'].value_counts()

type
politics         345
medical          299
entertainment    260
sports           258
Name: count, dtype: int64

In [16]:
from sklearn.svm import SVC

model1 = SVC()
model1.fit(X_train_vect,y_train)
pred_y = model1.predict(X_val_vect)

print(classification_report(pred_y,y_val))

               precision    recall  f1-score   support

entertainment       0.67      0.94      0.78        34
      medical       0.62      0.95      0.75        42
     politics       0.97      0.58      0.72       116
       sports       0.77      0.98      0.86        41

     accuracy                           0.77       233
    macro avg       0.76      0.86      0.78       233
 weighted avg       0.83      0.77      0.76       233



In [17]:
from sklearn.ensemble import RandomForestClassifier

model2 = RandomForestClassifier()
model2.fit(X_train_vect,y_train)
y_ = model2.predict(X_val_vect)

print(classification_report(y_,y_val))

               precision    recall  f1-score   support

entertainment       0.67      1.00      0.80        32
      medical       0.66      0.95      0.78        44
     politics       0.99      0.58      0.73       118
       sports       0.75      1.00      0.86        39

     accuracy                           0.78       233
    macro avg       0.76      0.88      0.79       233
 weighted avg       0.84      0.78      0.77       233



Handling imbalanced data

In [20]:
from imblearn.over_sampling import SMOTE

smote = SMOTE()
X_train_over,y_train_over = smote.fit_resample(X_train_vect,y_train)

type
politics         69
medical          64
sports           52
entertainment    48
Name: count, dtype: int64

In [32]:
model.fit(X_train_over,y_train_over)
pred_1 = model.predict(X_val_vect)

print(classification_report(pred_1,y_val))

               precision    recall  f1-score   support

entertainment       0.85      0.71      0.77        58
      medical       0.73      0.81      0.77        58
     politics       0.78      0.78      0.78        69
       sports       0.85      0.92      0.88        48

     accuracy                           0.80       233
    macro avg       0.80      0.80      0.80       233
 weighted avg       0.80      0.80      0.80       233

