Exercise 1

In [9]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [4]:
from textblob import TextBlob

# Loading Burbank text
with open("Burbank.txt", "r") as f:
    burbank_text = f.read()

In [6]:
blob = TextBlob(burbank_text)
sentiment = blob.sentiment

# Polarity ranges from negative -1 to positive +1
print("Polarity:", sentiment.polarity)

# Approx prob of being negative (if polarity < 0)
prob_negative = max(0, -sentiment.polarity)
print("Estimated prob of negative sentiment:", prob_negative)


Polarity: 0.09869334480780263
Estimated prob of negative sentiment: 0


Exercise 2

In [7]:
print("Polarity:", blob.sentiment.polarity)      # -1 to 1
print("Subjectivity:", blob.sentiment.subjectivity)  # 0 (objective) to 1 (subjective)


Polarity: 0.09869334480780263
Subjectivity: 0.3790877796901893


Exercise 3

In [10]:
from textblob import Word

words = [Word(word.lower()) for word in blob.words if word.isalpha()]
word_freq = {}
for word in words:
    word_freq[word] = word_freq.get(word, 0) + 1

# Sorting to get top 10 frequent words (naive topic extraction)
top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:10]
print("Top key topics:")
for word, freq in top_words:
    print(f"{word} ➔ {freq}")


Top key topics:
the ➔ 114
of ➔ 50
to ➔ 39
and ➔ 36
that ➔ 21
in ➔ 20
a ➔ 18
faa ➔ 13
burbank ➔ 12
said ➔ 12


Exercise 4

In [14]:
import pandas as pd

In [18]:
# Loading each dataset
amazon = pd.read_csv("amazon_cells_labelled.txt", sep="\t", header=None, names=["Review", "Label"])
imdb = pd.read_csv("imdb_labelled.txt", sep="\t", header=None, names=["Review", "Label"])
yelp = pd.read_csv("yelp_labelled.txt", sep="\t", header=None, names=["Review", "Label"])


In [19]:
# Adding company labels
amazon["Company"] = "Amazon"
imdb["Company"] = "IMDB"
yelp["Company"] = "Yelp"

# Combining
comb_data = pd.concat([amazon, imdb, yelp], ignore_index=True)
print(comb_data.head())

                                              Review  Label Company
0  So there is no way for me to plug it in here i...      0  Amazon
1                        Good case, Excellent value.      1  Amazon
2                             Great for the jawbone.      1  Amazon
3  Tied to charger for conversations lasting more...      0  Amazon
4                                  The mic is great.      1  Amazon


In [20]:
comb_data.to_csv("Sentiment_Analysis_Dataset.csv", index=False)
print("Columns:", comb_data.columns.tolist())
print("Null values:\n", comb_data.isnull().sum())

Columns: ['Review', 'Label', 'Company']
Null values:
 Review     0
Label      0
Company    0
dtype: int64


In [21]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

nlp = spacy.load("en_core_web_sm")
stopwords = list(STOP_WORDS)
punctuations = string.punctuation

def clean_text(text):
    doc = nlp(text.lower())
    return " ".join([token.lemma_ for token in doc if token.text not in stopwords and token.text not in punctuations])


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score


In [23]:
# Splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(comb_data['Review'], comb_data['Label'], test_size=0.2, random_state=42)

# Custom transformer
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

In [24]:
# Building pipeline
pipe_countvect = Pipeline([
    ("cleaner", predictors()),
    ("vectorizer", TfidfVectorizer()),
    ("classifier", LinearSVC())
])

# Training
pipe_countvect.fit(X_train, y_train)

# Predicting
y_pred = pipe_countvect.predict(X_test)
print("Accuracy on test data:", accuracy_score(y_test, y_pred))

Accuracy on test data: 0.8072727272727273


In [26]:
sample = "This product is wonderful!"
pred = pipe_countvect.predict([sample])[0]
print(sample, "Prediction ➔", pred)


This product is wonderful! Prediction ➔ 1
