In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('tweets.csv')

In [3]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


In [4]:
# Set the option to display the full text content of DataFrame columns
pd.set_option('display.max_colwidth',None)

In [5]:
df.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,2,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,3,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,4,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,5,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [6]:
df = df.drop("id", axis=1)

In [7]:
df.shape

(7920, 2)

In [8]:
df['label'].value_counts()

0    5894
1    2026
Name: label, dtype: int64

## Preprocessing

In [9]:
# remove puncuations
import string
import re
def remove_pun(text):
    text = ''.join([i for i in text if i not in string.punctuation])
    # removing URL
    text = re.sub(r'http\S+|www\S+|\S+\.com\S+', '', text, flags=re.MULTILINE)
    # removing the tags from the text
    text = re.sub(r'(@\S+) | (#\S+)', r'', text)
    # removing the RT from the text
    text = re.sub(r'\bRT\b', r'', text)
    # removing repeated characters
    return re.sub(r'(.)1+', r'1', text)


In [10]:
 # Tokenize the given text into words
import nltk
nltk.download('punkt')
def tokenize(text):
    words = nltk.word_tokenize(text)
    return words

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [11]:
# Remove stopwords from the given text
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
def remove_stopwords(text):
    output = [i for i in text if i not in stopwords]
    return output

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
def stemming(text):
  stem_text = [ps.stem(word) for word in text]
  return stem_text


In [13]:
# Lemmatize each word in the given text
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
wordnet_lemm = WordNetLemmatizer()
def lemma(text):
    lemm_text = [wordnet_lemm.lemmatize(word) for word in text]
    return lemm_text


[nltk_data] Downloading package wordnet to /root/nltk_data...


In [14]:
df.head()

Unnamed: 0,label,tweet
0,0,#fingerprint #Pregnancy Test https://goo.gl/h1MfQV #android #apps #beautiful #cute #health #igers #iphoneonly #iphonesia #iphone
1,0,Finally a transparant silicon case ^^ Thanks to my uncle :) #yay #Sony #Xperia #S #sonyexperias… http://instagram.com/p/YGEt5JC6JM/
2,0,We love this! Would you go? #talk #makememories #unplug #relax #iphone #smartphone #wifi #connect... http://fb.me/6N3LsUpCu
3,0,I'm wired I know I'm George I was made that way ;) #iphone #cute #daventry #home http://instagr.am/p/Li_5_ujS4k/
4,1,What amazing service! Apple won't even talk to me about a question I have unless I pay them $19.95 for their stupid support!


In [15]:
def preprocess(df_col):
  corpus = []
  for item in df_col:
    new_item = remove_pun(item)
    new_item = new_item.lower()
    new_item = tokenize(new_item)
    new_item = remove_stopwords(new_item)
    new_item = lemma(new_item)
    corpus.append(' '.join(str(x) for x in new_item))
  return corpus

In [16]:
corpus = preprocess(df['tweet'])

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x_train, x_test, y_train, y_test = train_test_split(corpus, df['label'], test_size=0.2, random_state=42)

## Bag-of-Words

In [19]:
# Extracting Features from Cleaned Tweets
# Transform training and test data into feature vectors

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(1,2))
train_vectors = cv.fit_transform(x_train)
test_vectors =cv.transform(x_test)
x = train_vectors
y = y_train

## Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x,y)

In [21]:
y_pred = rf.predict(test_vectors)

In [22]:
from sklearn.metrics import accuracy_score, classification_report

In [23]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.85


## Logistic Regression

In [24]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x, y)

In [25]:
y_pred1 = lr.predict(test_vectors)

accuracy = accuracy_score(y_test, y_pred1)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.88


## XGBoost

In [26]:
import xgboost as xgb
xgb = xgb.XGBClassifier(n_estimators = 100,max_depth=4,random_state=42)
xgb.fit(x,y)

In [27]:
y_pred2 = xgb.predict(test_vectors)
accuracy = accuracy_score(y_test, y_pred2)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.87


## SVM

In [28]:
from sklearn.svm import SVC
svm = SVC(kernel='poly')
svm.fit(x,y)

In [29]:
y_pred3 = svm.predict(test_vectors)
accuracy = accuracy_score(y_test, y_pred3)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.74


In [30]:
# Here logistic regression has the highest accuracy(Accuracy score:0.88).So lets's engage it for classification.
new_tweets = ["I really love the brand!! ","The product is very bad"]
preprocess_tweets = preprocess(new_tweets)
tweets_vec = cv.transform(preprocess_tweets)
predictions = lr.predict(tweets_vec)

In [31]:
for tweet, prediction in zip(new_tweets, predictions):
        if prediction == 1:
            sentiment = "Positive"
        else:
            sentiment = "Negative"
        print(f'Tweet: {new_tweets},\nPreprocessed Tweet: {preprocess_tweets},\nSentiment: {sentiment}')


Tweet: ['I really love the brand!! ', 'The product is very bad'],
Preprocessed Tweet: ['really love brand', 'product bad'],
Sentiment: Negative
Tweet: ['I really love the brand!! ', 'The product is very bad'],
Preprocessed Tweet: ['really love brand', 'product bad'],
Sentiment: Negative
