# Exercise 1. Sentiment Analysis

**1. Using textblob, what is the probability that the sentiment in the
Burbank text is going to negative?**

In [174]:
# Import necessary libraries
from textblob import TextBlob

# Open Burbank.txt file
file_path = "/content/Burbank.txt"
with open(file_path, 'r') as file:
    burbank_text = file.read()
    print('Burbank_text :','\n', burbank_text, '\n')

# Create a TextBlob object
blob = TextBlob(burbank_text)

# Get the sentiment polarity score (ranges from -1 for very negative to +1 for very positive)
sentiment_polarity = blob.sentiment.polarity
print("Sentiment polarity score : ",sentiment_polarity,'\n')

# Probability of negative sentiment
probability_neg = max(0, (1 - sentiment_polarity) / 2)
print(f"Probability of negative sentiment : {probability_neg}")

Burbank_text : 
 ﻿Airport task force to consider possible actions to abate noise issues Members of the Southern San Fernando Valley Airplane Noise Task Force will have a little over a month to develop recommendations about how to address aircraft noise issues affecting the south San Fernando Valley After five meetings and hearing from various neighborhood groups and experts, aviation consultant HMMH Inc., which has been the facilitator of the meetings, spent the majority of Wednesday night distilling all of the recommendations that have been brought forward so far and the feasibility of implementing them 
HMMH worked with Kevin Karpe, an aviation expert who worked as an air traffic controller at Hollywood Burbank and the Federal Aviation Administration Southern California Terminal Radar Approach Control, to analyze the departing flight procedures out of the Burbank airfield as well as Van Nuys Airport, both of which are at the center of the noise issues 
Since March 2017, residents in 

# Exercise 2. Sentiment Analysis

**1. Using the data from exercise 1 and textblob, what is the overall sentiment and subjectivity?**

In [175]:
blob.sentiment

Sentiment(polarity=0.09869334480780263, subjectivity=0.3790877796901893)

In [176]:
# Interpret sentiment polarity
# Range [-1, 1] (negative to positive)
if blob.sentiment.polarity > 0:
    print("Overall Sentiment: Positive")
elif blob.sentiment.polarity < 0:
    print("Overall Sentiment: Negative")
else:
    print("Overall Sentiment: Neutral")
print(f"Sentiment Polarity: {blob.sentiment.polarity}",'\n')

# Interpret sentiment subjectivity
# Range [0, 1] (objective to subjective)
if blob.sentiment.subjectivity > 0.5:
    print("Overall Subjectivity: Subjective")
else:
    print("Overall Subjectivity: Objective")
print(f"Sentiment Subjectivity: {blob.sentiment.subjectivity}")

Overall Sentiment: Positive
Sentiment Polarity: 0.09869334480780263 

Overall Subjectivity: Objective
Sentiment Subjectivity: 0.3790877796901893


# Exercise 3. Key topic using ‘Word’ from textblob (very simple way to determine the key topics) based on the Burbank text file.

**1. Import Word from textblob. Identify the key topics by using Word from textblob.**

In [177]:
# Download necessary resources
import nltk
nltk.download('brown')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [178]:
# Import necessary libraries
from textblob import TextBlob, Word
import re
from collections import Counter
import pandas as pd

# Function to preprocess the text
def preprocess_text(text):
    # Remove special characters and numbers using regular expressions
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Function to lemmatize words using Word class
def lemmatize_words(words):
    lemmatized_words = [Word(word).lemmatize() for word in words]
    return lemmatized_words

# Function to extract and lemmatize nouns from the text
def extract_nouns(text):
    blob = TextBlob(text)
    # Extract nouns using part-of-speech tagging
    nouns = [word for word, pos in blob.tags if pos == 'NN' or pos == 'NNS']
    # Lemmatize nouns using the Word class to convert plural to singular
    lemmatized_nouns = lemmatize_words(nouns)
    return lemmatized_nouns

# Function to identify the key topics based on noun frequency
def get_key_topics(nouns, top_n=10):
    # Count the frequency of each noun
    noun_counts = Counter(nouns)
    # Get the most common nouns (key topics)
    key_topics = noun_counts.most_common(top_n)
    return key_topics

# Load the Burbank text file (replace 'burbank_text.txt' with the actual file path)
with open('/content/Burbank.txt', 'r') as file:
    text = file.read()

# Preprocess the text
cleaned_text = preprocess_text(text)

# Extract and lemmatize nouns from the cleaned text
nouns = extract_nouns(cleaned_text)

# Get the top 10 key topics based on noun frequency
key_topics = get_key_topics(nouns, top_n=10)

# Create pandas dataframe and display the key topics
df = pd.DataFrame(key_topics, columns=['Phrase', 'Count'])
df.head(10)

Unnamed: 0,Phrase,Count
0,flight,15
1,task,10
2,force,10
3,valley,10
4,burbank,10
5,noise,9
6,recommendation,9
7,group,9
8,plane,9
9,meeting,8


# Exercise 4. Sentiment analysis with spaCy

**1. Load the datasets ‘amazon_cells_labelled.txt’, ‘imdb_labelled.txt’, ‘yelp_labelled.txt’**

In [179]:
# Import necessary libraries
import pandas as pd

# Load the dataset
amazon_df = pd.read_csv('/content/amazon_cells_labelled.txt', sep='\t', names=['sentence', 'label'])
amazon_df.head()

Unnamed: 0,sentence,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [180]:
# Load the dataset
imdb_df = pd.read_csv('/content/imdb_labelled.txt', sep='\t', names=['sentence', 'label'])
imdb_df.head()

Unnamed: 0,sentence,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [181]:
# Load the dataset
yelp_df = pd.read_csv('/content/yelp_labelled.txt', sep='\t', names=['sentence', 'label'])
yelp_df.head()

Unnamed: 0,sentence,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


**2. Create ‘combined_col’ by joining the tables such that combined_col=[data_amazon, data_imdb, data_yelp]**

In [182]:
# Create 'combined' as a list of data from each dataframe
combined = [amazon_df, imdb_df, yelp_df]
print(combined)

[                                              sentence  label
0    So there is no way for me to plug it in here i...      0
1                          Good case, Excellent value.      1
2                               Great for the jawbone.      1
3    Tied to charger for conversations lasting more...      0
4                                    The mic is great.      1
..                                                 ...    ...
995  The screen does get smudged easily because it ...      0
996  What a piece of junk.. I lose more calls on th...      0
997                       Item Does Not Match Picture.      0
998  The only thing that disappoint me is the infra...      0
999  You can not answer calls with the unit, never ...      0

[1000 rows x 2 columns],                                               sentence  label
0    A very, very, very slow-moving, aimless movie ...      0
1    Not sure who was more lost - the flat characte...      0
2    Attempting artiness with black & white

In [183]:
# Concatenate the dataframes into a single dataframe
combined_col = pd.concat([amazon_df, imdb_df, yelp_df], ignore_index=True)
combined_col.head()

Unnamed: 0,sentence,label
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


**3. Check the structure of data_amazon**

In [184]:
amazon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  1000 non-null   object
 1   label     1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [185]:
print("Null values in amazon_data : ",'\n',amazon_df.isnull().sum())

Null values in amazon_data :  
 sentence    0
label       0
dtype: int64


**4. Add headers for columns in each dataset: ‘Review’ and ‘Label’**

In [186]:
# Rename columns to 'Review' and 'Label' for each dataframe
amazon_df = amazon_df.rename(columns={'sentence': 'Review', 'label': 'Label'})
imdb_df = imdb_df.rename(columns={'sentence': 'Review', 'label': 'Label'})
yelp_df = yelp_df.rename(columns={'sentence': 'Review', 'label': 'Label'})

**5. Create a “Company’ column to identify each company ‘Amazon’, ‘imdb’, and ‘yelp’**

In [187]:
# Add a 'Company' column to each DataFrame
amazon_df['Company'] = 'Amazon'
imdb_df['Company'] = 'imdb'
yelp_df['Company'] = 'yelp'

# Now concatenate the DataFrames
comb_data = pd.concat([amazon_df, imdb_df, yelp_df], ignore_index=True)
comb_data.head()

Unnamed: 0,Review,Label,Company
0,So there is no way for me to plug it in here i...,0,Amazon
1,"Good case, Excellent value.",1,Amazon
2,Great for the jawbone.,1,Amazon
3,Tied to charger for conversations lasting more...,0,Amazon
4,The mic is great.,1,Amazon


**6. Explore the structure of the new dataset called ‘comb_data’**

In [188]:
# Explore the structure of 'comb_data'
comb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2748 entries, 0 to 2747
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Review   2748 non-null   object
 1   Label    2748 non-null   int64 
 2   Company  2748 non-null   object
dtypes: int64(1), object(2)
memory usage: 64.5+ KB


In [189]:
# Check the number of rows and columns
print(comb_data.shape,'\n')

# Check the distribution of labels across different companies
print(comb_data.groupby(['Company', 'Label'])['Review'].count())

(2748, 3) 

Company  Label
Amazon   0        500
         1        500
imdb     0        362
         1        386
yelp     0        500
         1        500
Name: Review, dtype: int64


**7. Use ‘comb_data.to_csv’ to create the ‘Sentiment_Analysis_Dataset’**

In [190]:
comb_data.to_csv('Sentiment_Analysis_Dataset.csv', index=False)

**8. Print the columns**

In [191]:
print(comb_data.columns)

Index(['Review', 'Label', 'Company'], dtype='object')


**9. Check for null values**

In [192]:
# Check for null values in each DataFrame
print("Null values in amazon_df:\n", amazon_df.isnull().sum())
print("\nNull values in imdb_df:\n", imdb_df.isnull().sum())
print("\nNull values in yelp_df:\n", yelp_df.isnull().sum())

# Check for null values in combined dataframe
print("\nNull values in comb_data:\n", comb_data.isnull().sum())

Null values in amazon_df:
 Review     0
Label      0
Company    0
dtype: int64

Null values in imdb_df:
 Review     0
Label      0
Company    0
dtype: int64

Null values in yelp_df:
 Review     0
Label      0
Company    0
dtype: int64

Null values in comb_data:
 Review     0
Label      0
Company    0
dtype: int64


**10. Import STOP_WORDS from spacy and stopwords from spacy.lang.en.stop_words**

In [193]:
# Import necessary libraries
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# print stopwords
stopwords = list(STOP_WORDS)
print(stopwords)

['nothing', 'somewhere', 'quite', 'down', 'this', 'via', 'whether', 'behind', 'just', 'herein', 'please', 'first', '’ll', 'nowhere', 'whither', 'the', 'noone', "'ll", 'mostly', 'another', 'hereupon', 'more', 'ours', 'several', 'make', 'namely', 'beside', 'empty', 'hence', 'would', 'somehow', 'thus', 'how', 'former', 'under', 'per', 'name', 'must', 'he', 'see', 'yourself', 'next', 'only', 'anyone', 'such', 'thru', 'seemed', 'any', 'my', 'still', 'wherever', 'it', 'everyone', 'should', 'from', 'been', '‘ll', 'whenever', 'and', 'else', 'not', 'whereby', 'as', 'doing', 'becomes', 'both', 'then', 'formerly', 'among', 'three', 'i', 'a', 'themselves', '‘re', 'anyhow', 'mine', 'thereby', 'that', 'seem', 'put', 'them', 'due', 'what', 'so', 'upon', 'full', 'after', 'however', 'all', 'go', 'often', 'many', 'give', 'either', 'even', 'for', 'together', 'herself', '’m', 'against', 'call', 'can', 'nevertheless', 'where', 'back', 'we', 'rather', 'seems', "'d", 'while', 'through', 'indeed', 'no', 'used

**11. Build a list of stopwords for filtering**

In [194]:
# Can add more stopwords as per our requirement
stopwords.extend(['would', 'could', 'should', 'one', 'two', 'three'])
print(stopwords)

['nothing', 'somewhere', 'quite', 'down', 'this', 'via', 'whether', 'behind', 'just', 'herein', 'please', 'first', '’ll', 'nowhere', 'whither', 'the', 'noone', "'ll", 'mostly', 'another', 'hereupon', 'more', 'ours', 'several', 'make', 'namely', 'beside', 'empty', 'hence', 'would', 'somehow', 'thus', 'how', 'former', 'under', 'per', 'name', 'must', 'he', 'see', 'yourself', 'next', 'only', 'anyone', 'such', 'thru', 'seemed', 'any', 'my', 'still', 'wherever', 'it', 'everyone', 'should', 'from', 'been', '‘ll', 'whenever', 'and', 'else', 'not', 'whereby', 'as', 'doing', 'becomes', 'both', 'then', 'formerly', 'among', 'three', 'i', 'a', 'themselves', '‘re', 'anyhow', 'mine', 'thereby', 'that', 'seem', 'put', 'them', 'due', 'what', 'so', 'upon', 'full', 'after', 'however', 'all', 'go', 'often', 'many', 'give', 'either', 'even', 'for', 'together', 'herself', '’m', 'against', 'call', 'can', 'nevertheless', 'where', 'back', 'we', 'rather', 'seems', "'d", 'while', 'through', 'indeed', 'no', 'used

**12. Import string, define ‘punctuations’ and define a ‘parser’**

In [195]:
# Import string, define punctuations, and define a parser
import string

# Define punctuations
punctuations = string.punctuation

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Define a parser function to filter out stopwords and punctuations
def parser(text):
    if isinstance(text, str):
        doc = nlp(text)
        return [token.text.lower() for token in doc if
            token.text.lower() not in stopwords and
            token.text not in punctuations and
            token.is_alpha]


**13. Tokenize the sentences**

In [196]:
# Apply the parser function to the 'Review' column in the DataFrame
comb_data['Review'] = comb_data['Review'].apply(lambda x: parser(x))

# Display the DataFrame with the parsed reviews
print(comb_data)

                                                 Review  Label Company
0                                [way, plug, converter]      0  Amazon
1                        [good, case, excellent, value]      1  Amazon
2                                      [great, jawbone]      1  Amazon
3     [tied, charger, conversations, lasting, minute...      0  Amazon
4                                          [mic, great]      1  Amazon
...                                                 ...    ...     ...
2743            [think, food, flavor, texture, lacking]      0    yelp
2744                        [appetite, instantly, gone]      0    yelp
2745                               [overall, impressed]      0    yelp
2746  [experience, underwhelming, think, ninja, sush...      0    yelp
2747  [wasted, life, poured, salt, wound, drawing, t...      0    yelp

[2748 rows x 3 columns]


**14. Import ‘CountVectorizer’, ‘TfidVectorizer’, ‘accuracy_score’, ‘TransformerMixin’, ‘Pipeline’, and ‘LinearSVC’**

In [197]:
# Import necessary libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

**15. Create a class 'predictors(TransformerMixin)'. Within the class, define 'transform', 'fit', and 'get_params'**

**16. Create a basic function to clean the text**

In [198]:
import re

# Create a class 'predictors' that extends TransformerMixin
class predictors(TransformerMixin):

    # Fit method
    def fit(self, X, y=None):
        return self

    # Transform method for cleaning the text
    def transform(self, X):
        return [self._clean_text(text) for text in X]

    # Helper function to clean the text
    def _clean_text(self, text):
        # Ensure the text is a string
        if not isinstance(text, str):
            text = str(text)

        # Remove special characters and numbers
        text = re.sub(r'[^A-Za-z\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        return text


    # Get params method
    def get_params(self, deep=True):
        return {}

**17. Vectorize and use LinearSVC as a classifier**

**18. Use TfidfVectorizer**

In [199]:
# Vectorize using TfidfVectorizer and classify with LinearSVC
pipe_countvect = Pipeline([
    ('cleaner', predictors()),            # Custom text cleaning
    ('vectorizer', TfidfVectorizer()),    # Vectorizer (TF-IDF)
    ('classifier', LinearSVC())           # Linear Support Vector Classifier
])


**19. Split the ‘com_data’ dataset into a train and test (20%) set**

In [200]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(comb_data['Review'], comb_data['Label'], test_size=0.2, random_state=17)

**20. Create a pipeline to clean, tokenize, vectorize, and classify as ‘pipe_countvect'**

In [201]:
pipe_countvect = Pipeline([
    ('cleaner', predictors()),            # Custom text cleaning
    ('vectorizer', TfidfVectorizer()),    # Vectorizer (TF-IDF)
    ('classifier', LinearSVC())           # Linear Support Vector Classifier
])

**21. Fit the data**

In [202]:
# Fit the pipeline on the training data
pipe_countvect.fit(X_train, y_train)


**22. Predict with the test dataset**

In [203]:
# Predict on the test dataset
y_test_pred = pipe_countvect.predict(X_test)

**23. Prediction results as ‘1’ for positive reviews, and ‘0’ for negative reviews**

**24. Use print(sample, “Prediction➔”, pred)**

In [204]:
# Output the test predictions
for sample, pred in zip(X_test, y_test_pred):
    print(f"{sample} : Prediction --> {pred}")

[] : Prediction --> 0
['know', 'sounds', 'funny', 'like', 'sketchy', 'technology', 'work', 'works', 'great'] : Prediction --> 1
['ordered', 'voodoo', 'pasta', 'time', 'excellent', 'pasta', 'going', 'gluten', 'free', 'years', 'ago'] : Prediction --> 1
['music', 'mark', 'snow', 'possibly', 'best', 'score', 'heard'] : Prediction --> 1
['course', 'acting', 'blah'] : Prediction --> 0
['allowing', 'poor', 'production', 'values', 'time', 'format', 'kind', 'mini', 'series', 'baaaaaad'] : Prediction --> 0
['lame', 'best', 'way', 'describe'] : Prediction --> 0
['received', 'white', 'colored', 'battery', 'goes', 'dead', 'couple', 'hoursthe', 'original', 'week', 'lasts', 'longer', 'thereplacement'] : Prediction --> 1
['leave', 'theater', 'wanting', 'dance', 'stars'] : Prediction --> 0
['thing', 'disappoint', 'infra', 'red', 'port', 'irda'] : Prediction --> 1
['sashimi', 'poor', 'quality', 'soggy', 'tasteless'] : Prediction --> 0
['fs', 'restaurant', 'wonderful', 'breakfast', 'lunch'] : Prediction 

**25. Determine the accuracy for the test dataset, X_test/sample prediction, and train dataset**

In [205]:
# Determine accuracy on the test set
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {accuracy * 100:.2f}%")

# Optionally, calculate accuracy on the training set
y_train_pred = pipe_countvect.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Training Set Accuracy: {train_accuracy * 100:.2f}%")

Test Set Accuracy: 78.36%
Training Set Accuracy: 98.54%
