<a href="https://colab.research.google.com/github/NqaaLadadwa/Tweet-Spam-Detection/blob/main/Tweet%20Spam%20Detection(Three_classifiers).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing the Dependencies**

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

**Data Collection & Pre-Processing**

In [None]:
# Load the training set from the CSV file into a pandas DataFrame.
# NOTE(review): '/content/train.csv' is an absolute path — presumably the Colab upload
# location (the notebook carries a Colab badge); adjust it when running elsewhere.
raw_tweet_data = pd.read_csv('/content/train.csv')

In [None]:
# Print a truncated plain-text view of the raw DataFrame (head/tail rows plus its shape).
print(raw_tweet_data)

          Id  ...     Type
0      10091  ...  Quality
1      10172  ...  Quality
2       7012  ...  Quality
3       3697  ...     Spam
4      10740  ...     Spam
...      ...  ...      ...
11963   7866  ...  Quality
11964   7841  ...  Quality
11965   9090  ...     Spam
11966   6818  ...  Quality
11967   4937  ...     Spam

[11968 rows x 8 columns]


In [None]:
# Replace missing values with a per-column default and keep the result in a new
# DataFrame, leaving raw_tweet_data untouched.
# The 'Id' key used to carry a trailing tab character ('Id\t'), so it never matched the
# real 'Id' column and that default was silently ignored.
# NOTE(review): rows whose 'Type' gets filled with "Not determined" cannot be converted
# by the later `astype('int')` on the labels — confirm that 'Type' has no missing values.
tweet_data = raw_tweet_data.fillna({
    'Id': 0,
    'Tweet': "No Tweet",
    'following': 0,
    'followers': 0,
    'actions': 0,
    'is_retweet': "Nothing",
    'location': "No place",
    'Type': "Not determined",
})

In [None]:
# Display the first 5 rows of the DataFrame obtained after filling missing values.
tweet_data.head()

Unnamed: 0,Id,Tweet,following,followers,actions,is_retweet,location,Type
0,10091,It's the everything else that's complicated. #...,0.0,11500.0,0.0,0.0,Chicago,Quality
1,10172,Eren sent a glare towards Mikasa then nodded a...,0.0,0.0,0.0,0.0,No place,Quality
2,7012,I posted a new photo to Facebook http://fb.me/...,0.0,0.0,0.0,0.0,"Scotland, U.K",Quality
3,3697,#jan Idiot Chelsea Handler Diagnoses Trump Wit...,3319.0,611.0,294.0,0.0,"Atlanta, Ga",Spam
4,10740,Pedophile Anthony Weiner is TERRIFIED of Getti...,4840.0,1724.0,1522.0,0.0,Blumberg,Spam


In [None]:
# Check the (number of rows, number of columns) of the filled DataFrame.
tweet_data.shape

(11968, 8)

**Label Encoding**

In [None]:
# Encode the target column in place: Spam -> 0, Quality -> 1.
# Both row masks are computed from the original string labels before anything is overwritten.
is_spam = tweet_data['Type'] == 'Spam'
is_quality = tweet_data['Type'] == 'Quality'

tweet_data.loc[is_spam, 'Type'] = 0
tweet_data.loc[is_quality, 'Type'] = 1
# From here on every spam tweet is represented by 0 and every quality tweet by 1.

In [None]:
# Separate the data into the input text and the label.
# X is the input feature: only the 'Tweet' column (the raw tweet text) is used.
# Y is the label: the 'Type' column, encoded above as 0 (Spam) / 1 (Quality).

# Declare the input feature first, then the target:
X = tweet_data['Tweet']
Y = tweet_data['Type']

In [None]:
# Inspect the input Series (tweet text).
print(X)

0        It's the everything else that's complicated. #...
1        Eren sent a glare towards Mikasa then nodded a...
2        I posted a new photo to Facebook http://fb.me/...
3        #jan Idiot Chelsea Handler Diagnoses Trump Wit...
4        Pedophile Anthony Weiner is TERRIFIED of Getti...
                               ...                        
11963                                     11:11 meet harry
11964    If BBC Food disappears the loss of knowledge w...
11965    Look What Liberals Did to This Historic Monume...
11966    I uploaded a new track, "Everyday Lite 1", on ...
11967    Trump should be declared the victor by about 9...
Name: Tweet, Length: 11968, dtype: object


In [None]:
# Inspect the label Series (0 = Spam, 1 = Quality).
print(Y)

0        1
1        1
2        1
3        0
4        0
        ..
11963    1
11964    1
11965    0
11966    1
11967    0
Name: Type, Length: 11968, dtype: object


**Splitting Data**

**Now, it is important in machine learning to split the data into training and testing sets, so that the model can be trained on one part and evaluated on data it has not seen.**

In [None]:
# Split the data with the imported train_test_split helper:
# 20% of the samples are held out for testing, the remaining 80% are used for training.
# random_state is fixed so that the data is split in the same way on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [None]:
# Compare the sizes of the full dataset, the training split and the test split.
# (The first label says "ratio", but the value printed is the shape of the full input Series.)
print('The whole ratio: ', X.shape)
print('80% (train data):', X_train.shape)
print('20% (test data): ', X_test.shape)

The whole ratio:  (11968,)
80% (train data): (9574,)
20% (test data):  (2394,)


**Feature Extraction**

This part converts the tweet text into meaningful numerical values (TF-IDF features) so that it can be understood by the classifiers, e.g. the "LogisticRegression" function.

In [None]:
# Build TF-IDF features: English stop words are dropped and the text is lowercased.
# `lowercase` must be the boolean True, not the string 'True': scikit-learn releases
# that validate estimator parameters reject a string here.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Fit the vectorizer on the training tweets only ...
X_train_features = feature_extraction.fit_transform(X_train)
# ... and reuse the already-fitted vectorizer on the test tweets (transform only, no refit).
X_test_features = feature_extraction.transform(X_test)

# The labels were stored with dtype object; convert them to integers for the classifiers.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [None]:
# Show the training tweets in their original text form, before vectorization.
print('This is the data of X_train before converting to numerical values \n')
print(X_train)

This is the data of X_train before converting to numerical values 

3155     #ObamaNextJob Lobby to require all Cristian's ...
4058                      #iPati playing AGAIN on LesediFM
11396    #ThingsYouCantIgnore  Married couples who are ...
8014     1 suspect in custody, 2 at large after police ...
6991     President Trump is doing what he was elected t...
                               ...                        
9160     Report: NCAA expands Ole Miss investigation  #...
9859     Fly Gal Friday Featured Angler Jenny Tatelman ...
11513    The car is so fed up with Ron and Harry. Throw...
1688     Truck attacker kills 84 celebrating France's B...
5994     Wojo: Skid raises questions about Ausmus futur...
Name: Tweet, Length: 9574, dtype: object


In [None]:
# Show the TF-IDF matrix of the training tweets; it prints as
# (row index, feature index) followed by the weight.
print('This is the data of X_train after converting to numerical values \n')
print(X_train_features)

This is the data of X_train after converting to numerical values 

  (0, 4690)	0.31494070909109995
  (0, 23683)	0.28087664960119274
  (0, 21246)	0.2399421303765932
  (0, 13802)	0.23736837111689602
  (0, 9164)	0.28551417574365323
  (0, 6746)	0.29086780677772134
  (0, 7503)	0.28087664960119274
  (0, 27223)	0.28087664960119274
  (0, 7480)	0.32902245422795
  (0, 21447)	0.29719980833441584
  (0, 15838)	0.31494070909109995
  (0, 18348)	0.29719980833441584
  (1, 15470)	0.6351433419459722
  (1, 19707)	0.4395291462046672
  (1, 13738)	0.6351433419459722
  (2, 21093)	0.3630764323462381
  (2, 7335)	0.5493261220739581
  (2, 16419)	0.5636502786466441
  (2, 25084)	0.498713022953557
  (3, 13786)	0.37704758274301714
  (3, 1509)	0.37704758274301714
  (3, 12796)	0.16002440241580146
  (3, 7331)	0.2522576301373051
  (3, 12161)	0.33332376570056715
  (3, 18113)	0.32718870222889634
  :	:
  (9571, 12164)	0.26697592103424916
  (9571, 15578)	0.19829957554694344
  (9571, 6015)	0.451989303690945
  (9572, 2503)	0.3

**Vectorization using Bag of Words**

In [None]:
# Bag-of-words vectorizer; its output is used below as input for the Naive Bayes model.
# NOTE(review): this import sits outside the imports cell at the top of the notebook.
from sklearn.feature_extraction.text import CountVectorizer  
matrix=CountVectorizer()

In [None]:
# Fit the bag-of-words vectorizer on the training tweets only, then encode both splits with it.
# .toarray() turns each result into a dense NumPy array; these arrays are what
# NB_model (GaussianNB) is later fitted and evaluated on.
# NOTE(review): dense matrices of this size can be memory-hungry — confirm they fit in RAM.
X_train_vect=matrix.fit_transform(X_train).toarray()
X_test_vect=matrix.transform(X_test).toarray()

In [None]:
# Display the dense bag-of-words array for the training tweets.
X_train_vect

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Display the dense bag-of-words array for the test tweets.
X_test_vect

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Display the training labels after the integer conversion.
Y_train

3155     0
4058     1
11396    0
8014     0
6991     0
        ..
9160     0
9859     1
11513    1
1688     0
5994     0
Name: Type, Length: 9574, dtype: int64

In [None]:
# Display the test labels after the integer conversion.
Y_test

1744     1
4580     0
10335    1
8717     0
10032    1
        ..
5294     0
9657     0
2343     0
9587     1
3515     0
Name: Type, Length: 2394, dtype: int64

**Training the models**

**1) Logistic Regression**

In [None]:
# Create the Logistic Regression classifier with its default settings.
LR_model = LogisticRegression()

In [None]:
# Train the Logistic Regression model on the TF-IDF features of the training tweets.
LR_model.fit(X_train_features, Y_train)

LogisticRegression()

**2) Decision Tree**

In [None]:
# Create the Decision Tree classifier; random_state is fixed at 10 for repeatable results.
dt_model = DecisionTreeClassifier(random_state=10)


In [None]:
# Train the Decision Tree on the same TF-IDF training features as the Logistic Regression model.
dt_model.fit(X_train_features, Y_train)

DecisionTreeClassifier(random_state=10)

**3) Naive Bayes**

In [None]:
# Create the Gaussian Naive Bayes classifier with its default settings.
NB_model=GaussianNB()

In [None]:
# Train Naive Bayes on the dense bag-of-words counts (not on the TF-IDF features
# used by the other two models).
NB_model.fit(X_train_vect,Y_train)

GaussianNB()

In [None]:
# Show the class labels stored on the fitted Naive Bayes model.
NB_model.classes_

array([0, 1])

In [None]:
# Predict labels for the test tweets with the Naive Bayes model.
Y_pred=NB_model.predict(X_test_vect)

In [None]:
# Display the predicted labels.
Y_pred

array([0, 0, 1, ..., 0, 1, 0])

In [None]:
# Confusion matrix of the Naive Bayes test predictions:
# rows are the true labels and columns the predicted labels (0 = Spam, 1 = Quality).
confusion_matrix(Y_test,Y_pred)

array([[1053,  102],
       [ 540,  699]])

**Evaluating the trained model**

**1) Logistic Regression**

In [None]:
# Logistic Regression: accuracy on the data the model was trained on.
prediction_on_training_data = LR_model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# Report the training accuracy computed in the preceding cell (Logistic Regression).
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.9747232086902027


In [None]:
# Logistic Regression: accuracy on the held-out test data.
prediction_on_test_data = LR_model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [None]:
# Report the test accuracy computed in the preceding cell (Logistic Regression).
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.934001670843776


**2) Decision Tree**

In [None]:
# Decision Tree: accuracy on the data the model was trained on.
prediction_on_training_data = dt_model.predict(X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# Report the training accuracy computed in the preceding cell (Decision Tree).
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  1.0


In [None]:
# Decision Tree: accuracy on the held-out test data.
prediction_on_test_data = dt_model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [None]:
# Report the test accuracy computed in the preceding cell (Decision Tree).
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.9273182957393483


**3) Naive Bayes**

In [None]:
# Naive Bayes: accuracy on the data the model was trained on (bag-of-words input).
prediction_on_training_data = NB_model.predict(X_train_vect)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)

In [None]:
# Report the training accuracy computed in the preceding cell (Naive Bayes).
print('Accuracy on training data : ', accuracy_on_training_data)

Accuracy on training data :  0.943806141633591


In [None]:
# Naive Bayes: accuracy on the held-out test data (bag-of-words input).
prediction_on_test_data = NB_model.predict(X_test_vect)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [None]:
# Report the test accuracy computed in the preceding cell (Naive Bayes).
print('Accuracy on test data : ', accuracy_on_test_data)

Accuracy on test data :  0.731829573934837


**The output of the best classifiers**

**1) Logistic Regression**

In [None]:
# Classify one hand-written tweet with the trained Logistic Regression model.
input_tweet = ["I just keep holding you down"]

# Turn the text into feature vectors with the already-fitted TF-IDF vectorizer.
input_data_features = feature_extraction.transform(input_tweet)

# Predict the encoded label and show the raw prediction array.
prediction = LR_model.predict(input_data_features)
print(prediction)

# Label encoding used in this notebook: 1 = Quality, anything else is reported as Spam.
print('Quality tweet' if prediction[0] == 1 else 'Spam tweet')

[1]
Quality tweet


In [None]:
# Classify a second hand-written tweet with the trained Logistic Regression model.
input_tweet = ["Black slave"]

# Turn the text into feature vectors with the already-fitted TF-IDF vectorizer.
input_data_features = feature_extraction.transform(input_tweet)

# Predict the encoded label and show the raw prediction array.
prediction = LR_model.predict(input_data_features)
print(prediction)

# Label encoding used in this notebook: 1 = Quality, anything else is reported as Spam.
print('Quality tweet' if prediction[0] == 1 else 'Spam tweet')

[0]
Spam tweet


**2) Decision Tree**

In [None]:
# Classify one hand-written tweet with the trained Decision Tree model.
input_tweet = ["I just keep holding you down"]

# Turn the text into feature vectors with the already-fitted TF-IDF vectorizer.
input_data_features = feature_extraction.transform(input_tweet)

# Predict the encoded label and show the raw prediction array.
prediction = dt_model.predict(input_data_features)
print(prediction)

# Label encoding used in this notebook: 1 = Quality, anything else is reported as Spam.
print('Quality tweet' if prediction[0] == 1 else 'Spam tweet')

[1]
Quality tweet


In [None]:
# Classify a second hand-written tweet with the trained Decision Tree model.
input_tweet = ["Black slave"]

# Turn the text into feature vectors with the already-fitted TF-IDF vectorizer.
input_data_features = feature_extraction.transform(input_tweet)

# Predict the encoded label and show the raw prediction array.
prediction = dt_model.predict(input_data_features)
print(prediction)

# Label encoding used in this notebook: 1 = Quality, anything else is reported as Spam.
print('Quality tweet' if prediction[0] == 1 else 'Spam tweet')

[0]
Spam tweet
