In [117]:
# Set seed for reproducibility
import random; random.seed(53)

# Import all we need from sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

In [118]:
import pandas as pd

df_train = pd.read_csv('train.csv')
df_train['message'].head(10)

0    PolySciMajor EPA chief doesn't think carbon di...
1    It's not like we lack evidence of anthropogeni...
2    RT @RawStory: Researchers say we have three ye...
3    #TodayinMaker# WIRED : 2016 was a pivotal year...
4    RT @SoyNovioDeTodas: It's 2016, and a racist, ...
5    Worth a read whether you do or don't believe i...
6    RT @thenation: Mike Pence doesn’t believe in g...
7    RT @makeandmendlife: Six big things we can ALL...
8    @AceofSpadesHQ My 8yo nephew is inconsolable. ...
9    RT @paigetweedy: no offense… but like… how do ...
Name: message, dtype: object

In [119]:
df_test = pd.read_csv('test_with_no_labels.csv')
df_test.head()

Unnamed: 0,message,tweetid
0,Europe will now be looking to China to make su...,169760
1,Combine this with the polling of staffers re c...,35326
2,"The scary, unimpeachable evidence that climate...",224985
3,@Karoli @morgfair @OsborneInk @dailykos \nPuti...,476263
4,RT @FakeWillMoore: 'Female orgasms cause globa...,872928


We shall go over several steps to clean the tweets dataset to remove the unnecessary content and highlight the key attributes suitable for the ML model.

### Step 1: Punctuation
The message text has several punctuations. Punctuations are often unnecessary as it doesn’t add value or meaning to the NLP model. The “string” library has 32 punctuations. The punctuations are:

To remove the punctuation in our dataset, let’s create a function and apply the function to the dataset:

In [120]:
import string
string.punctuation

def remove_punctuation(text):
    no_punct=[words for words in text if words not in string.punctuation]
    words_wo_punct=''.join(no_punct)
    return words_wo_punct
df_train['message']=df_train['message'].apply(lambda x: remove_punctuation(x))
df_train.head()

Unnamed: 0,sentiment,message,tweetid
0,1,PolySciMajor EPA chief doesnt think carbon dio...,625221
1,1,Its not like we lack evidence of anthropogenic...,126103
2,2,RT RawStory Researchers say we have three year...,698562
3,1,TodayinMaker WIRED 2016 was a pivotal year in...,573736
4,1,RT SoyNovioDeTodas Its 2016 and a racist sexis...,466954


### Step 2: Tokenization
Tokenizing is the process of splitting strings into a list of words. We will make use of Regular Expressions or regex to do the splitting. Regex can be used to describe a search pattern.

In [121]:
# import re 

# def tokenize(text):
#     split=re.split("\W+",text) 
#     return split
# df_train['message']=df_train['message'].apply(lambda x: tokenize(x.lower()))
# df_train.head()

### Step 3: Stop words

Now, we have a list of words without any punctuation. Let’s go ahead and remove the stop words. Stop words are irrelevant words that won’t help in identifying a text as real or fake. We will use “nltk” library for stop-words and some of the stop words in this library are :

In [122]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [123]:
import nltk

stopword = nltk.corpus.stopwords.words('english')
print(stopword[:11])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've"]


In [124]:
# # The column after step 3 has removed the unnecessary stop words.

# def remove_stopwords(text):
#     text=[word for word in text if word not in stopword]
#     return text
# df_train['message'] = df_train['message'].apply(lambda x: remove_stopwords(x))
# df_train.head()

### Step 4 : Lemmatize/ Stem

Stemming and Lemmatizing is the process of reducing a word to its root form. The main purpose is to reduce variations of the same word, thereby reducing the corpus of words we include in the model. The difference between stemming and lemmatizing is that, stemming chops off the end of the word without taking into consideration the context of the word. Whereas, Lemmatizing considers the context of the word and shortens the word into its root form based on the dictionary definition. Stemming is a faster process compared to Lemmantizing. Hence, it a trade-off between speed and accuracy.

### Step 5: Other steps

Other cleaning steps can be performed based on the data. I have listed a few of them below,

1) Remove URLs
2) Remove HTML tags
3) Remove emoji
4) Remove numbers

In [125]:
print(df_train.shape)
print(df_test.shape)

(15819, 3)
(10546, 2)


In [126]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15819 entries, 0 to 15818
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  15819 non-null  int64 
 1   message    15819 non-null  object
 2   tweetid    15819 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 370.9+ KB


In [127]:
X = df_train['message']
y = df_train.sentiment

In [128]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, 
                 random_state=53)

To

# Models

### Vectorize the tweets
We have the training and testing data all set up, but we need to create vectorized representations of the tweets in order to apply machine learning.

To do so, we will utilize the CountVectorizer and TfidfVectorizer classes which we will first need to fit to the data.

Once this is complete, we can start modeling with the new vectorized tweets!

In [129]:
# Initialize a CountVectorizer object: count_vectorizer
count_vectorizer = CountVectorizer(stop_words='english')

# Transform the training data using only the 'text' column values: count_train 
count_train = count_vectorizer.fit_transform(X_train)

# Transform the test data using only the 'text' column values: count_test 
count_test = count_vectorizer.transform(X_test)

# Print the first 10 features of the count_vectorizer
print(count_vectorizer.get_feature_names()[:10])

['000005', '009barca', '010536', '012', '02', '02cents0', '0519am', '07', '094', '0bamas']


In [134]:
# Initialize a TfidfVectorizer object: tfidf_vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)

# Transform the training data: tfidf_train 
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

# Transform the test data: tfidf_test 
tfidf_test = tfidf_vectorizer.transform(X_test.values)

# Print the first 10 features
print(tfidf_vectorizer.get_feature_names()[:10])

# Print the first 5 vectors of the tfidf training data
print(tfidf_train.A[:5])


['000005', '009barca', '010536', '012', '02', '02cents0', '0519am', '07', '094', '0bamas']
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [135]:
# Create the CountVectorizer DataFrame: count_df
count_df = pd.DataFrame(count_train.A, columns=count_vectorizer.get_feature_names())

# Create the TfidfVectorizer DataFrame: tfidf_df
tfidf_df = pd.DataFrame(tfidf_train.A, columns=tfidf_vectorizer.get_feature_names())

# Print the head of count_df
print(count_df.head())

# Print the head of tfidf_df
print(tfidf_df.head())

# Calculate the difference in columns: difference
difference = set(count_df.columns) - set(tfidf_df.columns)
print(difference)

# Check whether the DataFrames are equal
print(count_df.equals(tfidf_df))


   amp  believe  change  climate  doesnt  global  rt  trump  warming
0    0        0       2        1       0       0   0      1        0
1    0        0       1        1       0       0   0      0        0
2    0        0       1        1       0       0   1      0        0
3    0        0       0        0       0       1   1      1        1
4    0        0       1        1       0       0   0      0        0
   000005  009barca  010536  012   02  02cents0  0519am   07  094  0bamas  \
0     0.0       0.0     0.0  0.0  0.0       0.0     0.0  0.0  0.0     0.0   
1     0.0       0.0     0.0  0.0  0.0       0.0     0.0  0.0  0.0     0.0   
2     0.0       0.0     0.0  0.0  0.0       0.0     0.0  0.0  0.0     0.0   
3     0.0       0.0     0.0  0.0  0.0       0.0     0.0  0.0  0.0     0.0   
4     0.0       0.0     0.0  0.0  0.0       0.0     0.0  0.0  0.0     0.0   

   ...   เล  และ   ได  と通知した  どうなる米国  スタリん時代のソ連や毛沢東の文化大革命並のサイエンスに政治的介入だ  \
0  ...  0.0  0.0  0.0    0.0     0.0            

In [137]:
# Import the necessary modules
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics


# Instantiate a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(count_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(count_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels=[1,2])
print(cm)


0.5364872629764413
[[2213  214]
 [ 779  281]]


In [139]:
# Create a Multinomial Naive Bayes classifier: nb_classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(tfidf_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(tfidf_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred, labels = [1,2])
print(cm)


0.6238268530932771
[[2794   23]
 [ 780  408]]


In [141]:
import numpy as np

# Create the list of alphas: alphas
alphas = np.arange(0, 1, .1)

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(tfidf_train, y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(tfidf_test)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test, pred)
    return score

# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()
    

Alpha:  0.0
Score:  0.6843516567707336

Alpha:  0.1
Score:  0.7013981995786248

Alpha:  0.2
Score:  0.6855008619038498

Alpha:  0.30000000000000004
Score:  0.6736257421949818

Alpha:  0.4
Score:  0.6613675541084083

Alpha:  0.5
Score:  0.6506416395326565

Alpha:  0.6000000000000001
Score:  0.6441294771116645

Alpha:  0.7000000000000001
Score:  0.6381919172572305

Alpha:  0.8
Score:  0.6357019728021451

Alpha:  0.9
Score:  0.6265083317372151





In [142]:
# Get the class labels: class_labels
class_labels = nb_classifier.classes_

# Extract the features: feature_names
feature_names = tfidf_vectorizer.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb_classifier.coef_[0], feature_names))

# Print the first class label and the top 20 feat_with_weights entries
print(class_labels[0], feat_with_weights[:20])

# Print the second class label and the bottom 20 feat_with_weights entries
print(class_labels[1], feat_with_weights[-20:])


-1 [(-10.21383886170312, '000005'), (-10.21383886170312, '009barca'), (-10.21383886170312, '010536'), (-10.21383886170312, '012'), (-10.21383886170312, '02'), (-10.21383886170312, '02cents0'), (-10.21383886170312, '0519am'), (-10.21383886170312, '07'), (-10.21383886170312, '094'), (-10.21383886170312, '0x526978'), (-10.21383886170312, '100000'), (-10.21383886170312, '10000yr'), (-10.21383886170312, '1000s'), (-10.21383886170312, '1001'), (-10.21383886170312, '100daysofshame'), (-10.21383886170312, '100h'), (-10.21383886170312, '100isnow'), (-10.21383886170312, '100s'), (-10.21383886170312, '100th'), (-10.21383886170312, '100x')]
0 [(-8.208423438822152, 'noncompetitive'), (-8.197428484680545, 'make'), (-8.179573538953877, 'chinese'), (-8.166554237374992, 'hell'), (-8.159452954792574, 'money'), (-8.131251165939933, 'just'), (-8.116307938134558, 'fake'), (-8.09915477028163, 'manmade'), (-8.069777588427261, 'hoax'), (-7.996223622190371, 'amp'), (-7.984172853819045, 'man'), (-7.909185107947



## Separate from above

In [130]:
# Initialize count vectorizer
count_vectorizer = CountVectorizer(stop_words='english', 
                                   min_df=0.05, max_df=0.9)

# Create count train and test variables
count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

# Initialize tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', 
                                   min_df=0.05, max_df=0.9)

# Create tfidf train and test variables
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

### Training a multinomial naive Bayes model
Now that we have the data in vectorized form, we can train the first model. Investigate using the Multinomial Naive Bayes model with both the CountVectorizer and TfidfVectorizer data. Which do will perform better? How come?

To assess the accuracies, we will print the test sets accuracy scores for both models.

In [143]:
# Create a MulitnomialNB model
tfidf_nb = MultinomialNB()
tfidf_nb.fit(tfidf_train, y_train)

# Run predict on your TF-IDF test data to get your predictions
tfidf_nb_pred = tfidf_nb.predict(tfidf_test)

# Calculate the accuracy of your predictions
tfidf_nb_score = metrics.accuracy_score(y_test, tfidf_nb_pred)

# Create a MulitnomialNB model
count_nb = MultinomialNB()
count_nb.fit(count_train, y_train)

# Run predict on your count test data to get your predictions
count_nb_pred = count_nb.predict(count_test)

# Calculate the accuracy of your predictions
count_nb_score = metrics.accuracy_score(y_test, count_nb_pred)

print('NaiveBayes Tfidf Score: ', tfidf_nb_score)
print('NaiveBayes Count Score: ', count_nb_score)

NaiveBayes Tfidf Score:  0.6238268530932771
NaiveBayes Count Score:  0.5364872629764413


### Trying out another classifier: Linear SVC

In [150]:
tfidf_svc = LinearSVC()
tfidf_svc.fit(tfidf_train, y_train)
tfidf_svc_pred = tfidf_svc.predict(tfidf_test)
tfidf_svc_score = metrics.accuracy_score(y_test, tfidf_svc_pred)

print("LinearSVC Score:   %0.3f" % tfidf_svc_score)

svc_cm = metrics.confusion_matrix(y_test, tfidf_svc_pred, labels=[1, 2])
print(svc_cm)
 

LinearSVC Score:   0.725
[[2445  212]
 [ 305  842]]


In [158]:
from sklearn.metrics import classification_report

print('Classification Report')
print(classification_report(y_test, tfidf_svc_pred, labels=[1, 2]))

Classification Report
              precision    recall  f1-score   support

           1       0.75      0.87      0.80      2817
           2       0.72      0.71      0.72      1188

   micro avg       0.74      0.82      0.78      4005
   macro avg       0.74      0.79      0.76      4005
weighted avg       0.74      0.82      0.78      4005

