In [None]:
!pip install emoji



In [None]:
import re
import numpy as np
import pandas as pd

from lxml import html
from emoji import demojize

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the labelled tweets into a DataFrame and preview the first rows.
# NOTE(review): the path looks like a mounted Google Drive folder (Colab) —
# presumably Drive must be mounted before this cell runs; confirm.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index that was written to the CSV; it is not used by the cells below.
df = pd.read_csv('/content/drive/MyDrive/tweet_data.csv')
df.head()

Unnamed: 0,text,sentiment
0,Sooo SAD I will miss you here in San Diego!!!,negative
1,my boss is bullying me...,negative
2,what interview! leave me alone,negative
3,"Sons of ****, why couldn`t they put them on t...",negative
4,2am feedings for the baby are fun when he is a...,positive


In [None]:
# Class balance check: number of tweets per sentiment label.
df['sentiment'].value_counts()

positive    8582
negative    7781
Name: sentiment, dtype: int64

# Data Pre-Processing

In [None]:
stemmer = PorterStemmer()
stop = stopwords.words('english')

# Immutable set snapshot of `stop` for O(1) membership tests inside
# clean_text. `stop` itself stays a list for any other cell that uses it;
# re-run this cell if `stop` is edited so the snapshot is rebuilt.
_STOP_WORDS = frozenset(stop)

# Patterns are compiled once instead of on every call. Raw strings keep the
# backslash in `\S` from being parsed as an (invalid) string escape.
_URL_RE = re.compile(r'http\S+')
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z ]+')


def clean_text(text):
    """Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: replace emoji with their text names (``demojize``),
    strip HTML markup, drop ``http...`` links, replace every run of
    characters other than ASCII letters and spaces with a space, lowercase,
    drop English stopwords and tokens of two characters or fewer, then
    Porter-stem what is left.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. ``None`` or the NaN pandas uses
        for a missing cell) are treated as empty text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; may be empty.
    """
    # Missing values would otherwise crash inside demojize.
    if not isinstance(text, str):
        return ''

    # Convert emoji to strings
    text = demojize(text)

    # Remove HTML tags. This is deliberately best-effort: if lxml cannot
    # parse the input (for example an empty document) the text is kept
    # unchanged. `Exception` rather than a bare `except` so that
    # KeyboardInterrupt / SystemExit still propagate.
    try:
        text = html.document_fromstring(text).text_content()
    except Exception:
        pass

    # Remove hyperlinks
    text = _URL_RE.sub(' ', text)

    # Remove everything that is not an ASCII letter or a space
    text = _NON_ALPHA_RE.sub(' ', text)

    # Lowercase and split on whitespace
    tokens = text.lower().split()

    # Drop stopwords and short words, then stem the remaining tokens
    stems = [
        stemmer.stem(token)
        for token in tokens
        if len(token) > 2 and token not in _STOP_WORDS
    ]

    # Join and return
    return ' '.join(stems)

In [None]:
# Smoke test of clean_text on one string that mixes punctuation,
# contractions, an emoji and a URL; prints the input next to the result.
# NOTE(review): clean_text stems every kept token, yet the saved output of
# this cell shows unstemmed words (e.g. "trying") — the output looks stale;
# re-run the notebook to refresh it.
sample_text = "Hi there! i've been trying this product: for a while now it's 🔥 https://www.amazon.in/s?k=keyboard&page=2"
print(f'Original String: {sample_text}')
print(f'Cleaned String: {clean_text(sample_text)}')

Original String: Hi there! i've been trying this product: for a while now it's 🔥 https://www.amazon.in/s?k=keyboard&page=2
Cleaned String: trying product fire


In [None]:
# Clean every tweet, and encode the sentiment label as an integer
# (positive -> 1, negative -> 0). Series.map yields NaN for any label that is
# not a key of the mapping.
texts = df['text'].apply(clean_text)
labels = df['sentiment'].map({'positive':1, 'negative': 0})

In [None]:
# Preview the cleaned tweets (pandas abbreviates the displayed Series).
texts

0                                  sooo sad miss san diego
1                                            boss bullying
2                                    interview leave alone
3                         sons put releases already bought
4                            feedings baby fun smiles coos
                               ...                        
16358                                          enjoy night
16359    wish could come see denver husband lost job af...
16360    wondered rake client made clear net force devs...
16361    yay good enjoy break probably need hectic week...
16362                                                worth
Name: text, Length: 16363, dtype: object

In [None]:
# Preview the integer-encoded sentiment labels.
labels

0        0
1        0
2        0
3        0
4        1
        ..
16358    1
16359    0
16360    0
16361    1
16362    1
Name: sentiment, Length: 16363, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the data for evaluation. No split size is passed, so
# scikit-learn's default is used; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    texts,
    labels,
    random_state=0,
)

# ML For Text Classification

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer with scikit-learn's default settings (no arguments passed).
tfidf = TfidfVectorizer()

In [None]:
# Fit the vocabulary and IDF weights on the training split only, then reuse
# them to transform the test split.
# The results are kept as SciPy sparse matrices: MultinomialNB accepts sparse
# input directly, whereas `.todense()` would materialise a full
# (documents x vocabulary) np.matrix, which wastes memory and is rejected as
# estimator input by recent scikit-learn releases.
texts_train = tfidf.fit_transform(x_train)
texts_test = tfidf.transform(x_test)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes classifier with default hyper-parameters.
clf = MultinomialNB()

In [None]:
# Train the classifier on the TF-IDF features of the training split.
clf.fit(texts_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Evaluate on the held-out split: per-class precision, recall and F1
# (class 0 = negative, class 1 = positive, per the label mapping above).
print(classification_report(y_test, clf.predict(texts_test)))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      1913
           1       0.85      0.90      0.87      2178

    accuracy                           0.86      4091
   macro avg       0.86      0.86      0.86      4091
weighted avg       0.86      0.86      0.86      4091

