In [2]:
import csv
import string
import re
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [3]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [4]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/prabin_kumar_baniya_np/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
tokenizer = RegexpTokenizer(r'\w+')

In [6]:
en_stopwords = set(stopwords.words("english"))

In [7]:
ps = PorterStemmer()

In [8]:
def getCleanData(text):
    # Converting sting into lower case
    text = str.lower(text)
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text)  
    # Generating Patterns for Emoji
    emoji_pattern = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)  # removing emoticons
    # Removing Emoji from the string
    text = emoji_pattern.sub(r'', text)
    # removes the digits from the string
    text = re.sub(" \d+", " ", text)
    # Removing the white spaces from the beginning and end of string
    text = text.strip()
    # Tokenizing the text
    tokens = tokenizer.tokenize(text)
    # Removing the stop words
    new_tokens = [token for token in tokens if token not in en_stopwords]
    # Stemming
    stemmed_tokens = [ps.stem(token) for token in new_tokens]
    clean_text = " ".join(stemmed_tokens)
    return clean_text

In [9]:
# Reading Comments from the csv
comments = pd.read_csv("comments.csv", sep='\t', names=['comment'])
# Loading data from the csv to dataframe and then cleaning the data
df = comments['comment'].apply(getCleanData)
df.replace('\n', 'NaN')
df.replace(' ', 'NaN')

0                                         thank took role
1       awww listen start imagin walter white marri wo...
2                huh chang script walter forti fifti show
3                                                    woke
4                                 hate charact walter man
                              ...                        
1179                                                     
1180                                         favorit song
1181                                      love song choir
1182                                                     
1183                                         favorit song
Name: comment, Length: 1184, dtype: object

In [10]:
df = df.to_numpy()

In [11]:
i=0
# Creating a new csv file with cleaned data
with open("cleanedComments.csv", "w") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    for i in range(0, 450):
        str1 = ''.join(str(e) for e in df[i])
        if (str1 != '\n' and str1 != ''):
            csv_file.write(str1 + "\n")
        else:
            continue

In [12]:
# Getting the training and testing data
data_train = pd.read_csv("DataSet.csv", encoding="latin-1")
data_testing = pd.read_csv("cleanedComments.csv", encoding="latin-1", names=["Comment"])
labels = data_train.Sentiment

In [13]:
X = data_train.SentimentText.apply(getCleanData)
y = data_train.Sentiment

In [14]:
y

0        0
1        0
2        1
3        0
4        0
        ..
99983    0
99984    0
99985    1
99986    0
99987    1
Name: Sentiment, Length: 99988, dtype: int64

In [15]:
stopset = set(stopwords.words("english"))

In [16]:
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents="ascii", stop_words=stopset)

In [17]:
X = vectorizer.fit_transform(X)

In [18]:
print(y.shape)
print(X.shape)

(99988,)
(99988, 94005)


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.60, random_state=20)

In [20]:
clf = MultinomialNB()

In [21]:
clf.fit(X_train, y_train)

MultinomialNB()

In [22]:
from sklearn.metrics import roc_auc_score

In [23]:
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.8009619394848285

In [24]:
import numpy as np

In [33]:
comments_array = np.array(["This is a nice movie", "This is a bad movie"])
cleaned_comments = [getCleanData(text) for text in comments_array]
comments_vector = vectorizer.transform(cleaned_comments)

In [34]:
clf.predict(comments_vector)

array([1, 0])