In [49]:
#Importing all the necessary libraries

import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
from nltk.corpus import words
nltk.download('stopwords')
# ML Libraries

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [160]:
# define function to read train file

def load_dataset_train(filename):
    dataset_train = pd.read_csv(filename)
    return dataset_train

In [161]:
# define function to read test file

def load_dataset_test(filename):
    dataset_test = pd.read_csv(filename)
    return dataset_test

In [162]:
# define function to remove unwanted columns from train dataset

def remove_unwanted_cols_train(dataset_train, cols):
    for col in cols:
        del dataset_train[col]
    return dataset_train

In [163]:
# define function to remove unwanted columns from test dataset

def remove_unwanted_cols_test(dataset_test, cols):
    for col in cols:
        del dataset_test[col]
    return dataset_test

In [167]:
# define function to clean the tweets

def preprocess_tweet_text(tweet):
    tweet.lower()
    
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    
    # Remove user @ references and '#' from tweet
    tweet = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT)", " ", tweet)
    
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    
    
    # Remove stopwords
    stop = stopwords.words('english')
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop]
    
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]
    #lemmatizer = WordNetLemmatizer()
    #lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(filtered_words)

In [173]:
# Define function to implement vectorization (convert text to numbers) using tf-idf technique

def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [168]:
# Calling all the function

#load train dataset
dataset_train = load_dataset_train("train_dataset.csv")

#Load test dataset
dataset_test = load_dataset_test("test_dataset.csv")

# Remove unwanted columns from train dataset
n_dataset_train = remove_unwanted_cols_train(dataset_train, ['tweet_source', 'screen_name', 'tweet_id', 'retweet_count', 'account_creation_date', 'location', 'coordinates', 'urls'])

# Remove unwanted columns from test dataset
n_dataset_test = remove_unwanted_cols_test(dataset_test, ["Polarity", "Created_at", "screen_name", "tweet_id"])

In [166]:
dataset_test.head()

Unnamed: 0,Sentiment,tweet_text
0,Neutral,RT @Traycing: @MinteeKneez Do those bodies rep...
1,Positive,RT @DrJenChen4kids: What most of #medtwitter i...
2,Neutral,"RT @FlooringDirecTX: Yes, we're OPEN! Start a ..."
3,Neutral,Sale Sale\nกระดาษชุบแอลกอฮอล์\n1กล่อง 100แผ่น ...
4,Neutral,@MinteeKneez Do those bodies represent the peo...


In [169]:
#Preprocess train data
dataset_train['tweet_text'] = dataset_train['tweet_text'].apply(str)
dataset_train['tweet_text'] = dataset_train['tweet_text'].apply(preprocess_tweet_text)

In [171]:
# Preprocess test data
dataset_test['tweet_text'] = dataset_test['tweet_text'].apply(str)
dataset_test['tweet_text'] = dataset_test['tweet_text'].apply(preprocess_tweet_text)

In [178]:
dataset_test.head()

Unnamed: 0,Sentiment,tweet_text
0,Neutral,Do bodies represent people died Governor Cuomo...
1,Positive,What medtwitter NOT MemorialDayWeekend going b...
2,Neutral,Yes OPEN Start FlooringInstallation project ge...
3,Neutral,Sale Sale 1 100 50 19 19 19 19 19
4,Neutral,Do bodies represent people died Governor Cuomo...


In [174]:
# Split dataset into Train, Test

# Same tf vector will be used for Testing sentiments on unseen trending data
tf_vector = get_feature_vector(np.array(dataset_train.iloc[:, 1]).ravel())
X = tf_vector.transform(np.array(dataset_train.iloc[:, 1]).ravel())
y = np.array(dataset_train.iloc[:, 0]).ravel()
X_train = X
y_train = y
X_test = tf_vector.transform(np.array(dataset_test.iloc[:, 1]).ravel())
y_test = np.array(dataset_test.iloc[:, 0]).ravel()

#train_test_split(X, y, test_size=0.2, random_state=30)

In [139]:
y_test

array(['Neutral', 'Negative', 'Neutral', ..., 'Neutral', 'Neutral',
       'Neutral'], dtype=object)

In [175]:
# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

0.65831


In [176]:
# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

0.70369


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [131]:
# Testing logistic regression model on test data

test_file_name = "Test_dataset.csv"
test_ds = load_dataset(test_file_name)
test_ds = remove_unwanted_cols(test_ds, ["Polarity", "Created_at", "screen_name", "tweet_id"])


In [132]:
test_ds.head()

Unnamed: 0,Sentiment,tweet_text
0,Neutral,RT @Traycing: @MinteeKneez Do those bodies rep...
1,Positive,RT @DrJenChen4kids: What most of #medtwitter i...
2,Neutral,"RT @FlooringDirecTX: Yes, we're OPEN! Start a ..."
3,Neutral,Sale Sale\nกระดาษชุบแอลกอฮอล์\n1กล่อง 100แผ่น ...
4,Neutral,@MinteeKneez Do those bodies represent the peo...


In [179]:
# Using Logistic Regression model for prediction on test data

test_prediction_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, test_prediction_lr))

0.70369


In [183]:
#Print test data
data_tweet = dataset_test['tweet_text']
test_result_ds = pd.DataFrame({'tweet': data_tweet, 'prediction':test_prediction_lr})
test_result_ds.head(50)

Unnamed: 0,tweet,prediction
0,Do bodies represent people died Governor Cuomo...,Neutral
1,What medtwitter NOT MemorialDayWeekend going b...,Neutral
2,Yes OPEN Start FlooringInstallation project ge...,Neutral
3,Sale Sale 1 100 50 19 19 19 19 19,Neutral
4,Do bodies represent people died Governor Cuomo...,Neutral
5,gorjus lashes look order My lash line style Du...,Neutral
6,anything suppress truth If drugs kill anyone d...,Negative
7,There 8 deaths today result COVID19 North This...,Negative
8,maskup Click order covid19 manadatory tanayuhy...,Neutral
9,Monday May 25 2020 9 00 PM EST Total confirmed...,Neutral
