# Installing and importing libraries

In [21]:
# for text manipulation
import nltk 
import string
import re

# for data manipulation
import pandas as pd
import numpy as np

# importing different libraries for analysis, processing and classification
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer

# performance evaluation criteria 
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier

## Importing the Dataset

In [2]:
tweets = pd.read_csv('sentiment_tweets3.csv')

In [3]:
tweets.head()

Unnamed: 0.1,Unnamed: 0,message,label
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


## Data Cleaning

In [4]:
tweets.drop(['Unnamed: 0'], axis=1, inplace=True)

In [5]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  10314 non-null  object
 1   label    10314 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 161.3+ KB


In [6]:
tweets['label'].value_counts()

0    8000
1    2314
Name: label, dtype: int64

In [7]:
def clean_message(message):
    # Removing usernames
    message = re.sub(r'\w*@\w*', '', message)

    # Removing URLs
    message = re.sub(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', '', message, flags=re.MULTILINE)
    
    # Removing everything but letters
    message = re.sub("^a-zA-Z", "", message)

    stop_words = stopwords.words('english')
    tokens = word_tokenize(message)
    
    cleaned_tokens = []
    
    for token, tag in pos_tag(tokens):
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
            
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        
        if token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token)
    
    cleaned_text = ' '.join(cleaned_tokens)
    return cleaned_text

In [8]:
processed_features = []
for sentence in range(0, len(tweets.message)):
    cleaned = clean_message(tweets.message[sentence])
    
    processed_features.append(cleaned)

## Bag of Words

In [9]:
vectorizer = TfidfVectorizer(max_features=2500, min_df=6, max_df=0.8)
processed_features = vectorizer.fit_transform(processed_features).toarray()

## Splitting our data set into training and testing

In [10]:
X_train, X_test, y_train, y_test = train_test_split(processed_features, tweets.label, test_size=0.3, random_state=0)

## Training the model

In [11]:
text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [12]:
predictions = text_classifier.predict(X_test)

In [13]:
print(classification_report(y_test,predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2397
           1       1.00      0.99      0.99       698

   micro avg       1.00      1.00      1.00      3095
   macro avg       1.00      0.99      1.00      3095
weighted avg       1.00      1.00      1.00      3095

0.9977382875605816


## Test

In [17]:
sample = "Working on the Electronic Press Kit for THE BATEMAN LECTURES ON DEPRESSION today. Here's what I have so far."

In [29]:
cleaned = clean_message(sample)

vectorized = vectorizer.transform([cleaned])
print(text_classifier.predict(vectorized))

ValueError: cannot use sparse input in 'SVC' trained on dense data