# Installing and importing libraries

In [1]:
# for text manipulation
import nltk 
import string
import re

# for data manipulation
import pandas as pd
import numpy as np

# importing different libraries for analysis, processing and classification
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
from nltk.tag import pos_tag

from sklearn.feature_extraction.text import TfidfVectorizer

# performance evaluation criteria 
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

## Importing the Dataset

In [11]:
tweets = pd.read_csv('sentiment_tweets3.csv')

In [12]:
tweets.head()

Unnamed: 0.1,Unnamed: 0,message,label
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


## Data Cleaning

In [13]:
tweets.drop(['Unnamed: 0'], axis=1, inplace=True)

In [14]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  10314 non-null  object
 1   label    10314 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 161.3+ KB


In [15]:
tweets['label'].value_counts()

0    8000
1    2314
Name: label, dtype: int64

In [16]:
def clean_message(message):
    # Removing usernames
    message = re.sub(r'\w*@\w*', '', message)

    # Removing URLs
    message = re.sub(r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*', '', message, flags=re.MULTILINE)
    
    # Removing everything but letters
    message = re.sub("^a-zA-Z", "", message)

    stop_words = stopwords.words('english')
    tokens = word_tokenize(message)
    
    cleaned_tokens = []
    
    for token, tag in pos_tag(tokens):
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
            
        lemmatizer = WordNetLemmatizer()
        token = lemmatizer.lemmatize(token, pos)
        
        if token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token)
    
    return cleaned_tokens

In [17]:
processed_features = []
for sentence in range(0, len(tweets.message)):
    cleaned = clean_message(tweets.message[sentence])
    for word in cleaned:
        processed_features.append(word)

## Bag of Words

In [21]:
len(processed_features)

91594

In [22]:
vectorizer = TfidfVectorizer(max_features=2500, min_df=6, max_df=0.8)
processed_features = vectorizer.fit_transform(processed_features).toarray()

## Splitting our data set into training and testing

In [23]:
X_train, X_test, y_train, y_test = train_test_split(processed_features, tweets.label, test_size=0.2, random_state=0)

ValueError: Found input variables with inconsistent numbers of samples: [91594, 10314]