<a href="https://colab.research.google.com/github/TuanMinhLuu/ADS-Submission-Final-Report/blob/main/Sentimet_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Import necessary libraries and install missing packages
!pip install fasttext pandas numpy textblob
!pip install vaderSentiment
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk

# Download necessary NLTK data if not already downloaded
nltk.download('stopwords')
nltk.download('wordnet')

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296187 sha256=89029f26c99245e1aafe5bfe3f902f1a0ec777aca9510ac8b21c0ffdb7e6ec11
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Import the data

In [17]:
# Mount to my Google Drive to access files
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
# Define the file path to the dataset containing news
file_path = '/content/drive/My Drive/Sentiment_Analysis/bbc_news.csv'

# Load the dataset
df = pd.read_csv(file_path)
print(df.head())

                                               title  \
0  Ukraine: Angry Zelensky vows to punish Russian...   
1  War in Ukraine: Taking cover in a town under a...   
2         Ukraine war 'catastrophic for global food'   
3  Manchester Arena bombing: Saffie Roussos's par...   
4  Ukraine conflict: Oil price soars to highest l...   

                         pubDate  \
0  Mon, 07 Mar 2022 08:01:56 GMT   
1  Sun, 06 Mar 2022 22:49:58 GMT   
2  Mon, 07 Mar 2022 00:14:42 GMT   
3  Mon, 07 Mar 2022 00:05:40 GMT   
4  Mon, 07 Mar 2022 08:15:53 GMT   

                                               guid  \
0  https://www.bbc.co.uk/news/world-europe-60638042   
1  https://www.bbc.co.uk/news/world-europe-60641873   
2      https://www.bbc.co.uk/news/business-60623941   
3            https://www.bbc.co.uk/news/uk-60579079   
4      https://www.bbc.co.uk/news/business-60642786   

                                                link  \
0  https://www.bbc.co.uk/news/world-europe-606380...   
1  

In [19]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Function to clean and preprocess text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text.strip()

# Apply basic cleaning to the title and description columns
df['title_cleaned'] = df['title'].apply(clean_text)
df['description_cleaned'] = df['description'].apply(clean_text)

# Define stop words and a function to remove them
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stop_words])

# Apply stopwords removal to the cleaned columns
df['title_cleaned'] = df['title_cleaned'].apply(remove_stopwords)
df['description_cleaned'] = df['description_cleaned'].apply(remove_stopwords)

# Initialize the WordNetLemmatizer and define a function to lemmatize text
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# Apply lemmatization to the cleaned columns
df['title_cleaned'] = df['title_cleaned'].apply(lemmatize_text)
df['description_cleaned'] = df['description_cleaned'].apply(lemmatize_text)

# Filter out rows where the cleaned text is too short
df = df[df['title_cleaned'].apply(lambda x: len(x.split()) > 2)]
df = df[df['description_cleaned'].apply(lambda x: len(x.split()) > 2)]

# Remove rows where the cleaned text length is below a reasonable threshold
df = df[(df['title_cleaned'].str.len() > 10) & (df['description_cleaned'].str.len() > 10)]

# Initialize VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to classify sentiment using VADER based on combined title and description content
def get_vader_combined_sentiment(title, description):
    title_score = analyzer.polarity_scores(title)['compound']
    description_score = analyzer.polarity_scores(description)['compound']
    avg_score = (title_score + description_score) / 2

    if avg_score >= 0.05:
        return '__label__positive'
    elif avg_score <= -0.05:
        return '__label__negative'
    else:
        return '__label__neutral'

# Apply VADER sentiment classification to the cleaned title and description columns
df['combined_sentiment'] = df.apply(lambda x: get_vader_combined_sentiment(x['title_cleaned'], x['description_cleaned']), axis=1)

# Format the cleaned data for FastText
df['combined_fasttext_format_cleaned'] = df['combined_sentiment'] + " " + df['description_cleaned']

# Save the cleaned dataset to a file
output_file = '/content/drive/My Drive/Sentiment_Analysis/bbc_news_combined_fasttext_cleaned.txt'
df['combined_fasttext_format_cleaned'].to_csv(output_file, index=False, header=False)

# Display the first few rows of the cleaned and formatted data
print("Cleaned Combined Sentiment Data:")
print(df[['title_cleaned', 'description_cleaned', 'combined_sentiment', 'combined_fasttext_format_cleaned']].head())

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Cleaned Combined Sentiment Data:
                                       title_cleaned  \
0  ukraine angry zelensky vow punish russian atro...   
1               war ukraine taking cover town attack   
2               ukraine war catastrophic global food   
3  manchester arena bombing saffie roussoss paren...   
4  ukraine conflict oil price soar highest level ...   

                                 description_cleaned combined_sentiment  \
0  ukrainian president say country forgive forget...  __label__negative   
1  jeremy bowen frontline irpin resident came rus...  __label__negative   
2  one world biggest fertiliser firm say conflict...  __label__negative   
3  parent manchester arena bombing youngest victi...  __label__negative   
4  consumer feeling impact higher energy cost fue...   __label__neutral   

                    combined_fasttext_format_cleaned  
0  __label__negative ukrainian president say coun...  
1  __label__negative jeremy bowen frontline irpin...  
2  __label__ne

In [20]:
# Display the distribution of sentiment classes
print("Class Distribution in Combined Sentiment Data:")
print(df['combined_sentiment'].value_counts())

Class Distribution in Combined Sentiment Data:
combined_sentiment
__label__negative    17801
__label__positive    14401
__label__neutral      7077
Name: count, dtype: int64


In [21]:
from sklearn.model_selection import train_test_split

# Split the combined data into training (80%) and testing (20%) sets.
train_data, test_data = train_test_split(df['combined_fasttext_format_cleaned'], test_size=0.2, random_state=42)

# Save the training and testing datasets to separate files.
train_data.to_csv('/content/drive/My Drive/Sentiment_Analysis/bbc_news_train_combined.txt', index=False, header=False)
test_data.to_csv('/content/drive/My Drive/Sentiment_Analysis/bbc_news_test_combined.txt', index=False, header=False)

# Display the sizes of the training and testing sets.
print(f"Training set size: {len(train_data)}")
print(f"Testing set size: {len(test_data)}")

Training set size: 31423
Testing set size: 7856


In [22]:
import fasttext

# Train the FastText model using the training dataset.
model = fasttext.train_supervised(
    input='/content/drive/My Drive/Sentiment_Analysis/bbc_news_train_combined.txt',
    lr=1.0,
    epoch=25,
    wordNgrams=2,
    verbose=2,
    minCount=1
)

# Save the trained model to a file.
model.save_model('/content/drive/My Drive/Sentiment_Analysis/bbc_news_model_combined.bin')
print("Model trained and saved.")

Model trained and saved.


In [23]:
# Evaluate the model on the test dataset.
result = model.test('/content/drive/My Drive/Sentiment_Analysis/bbc_news_test_combined.txt')

# Display evaluation metrics.
print(f"Number of examples: {result[0]}")
print(f"Precision: {result[1]}")
print(f"Recall: {result[2]}")

Number of examples: 7856
Precision: 0.7368890020366599
Recall: 0.7368890020366599


In [24]:
# Import libraries for evaluation
from sklearn.metrics import confusion_matrix, classification_report
import pandas as pd

# Load the test dataset with labels
test_file_path = '/content/drive/My Drive/Sentiment_Analysis/bbc_news_test_combined.txt'

# Read the test data
test_data = pd.read_csv(test_file_path, header=None, names=['text'])

# Extract labels and separate text
test_data['label'] = test_data['text'].apply(lambda x: x.split()[0])
test_data['text_only'] = test_data['text'].apply(lambda x: ' '.join(x.split()[1:]))

# Get predictions for the test dataset
predictions = [model.predict(row)[0][0] for row in test_data['text_only']]

# Calculate and display the confusion matrix
conf_matrix = confusion_matrix(test_data['label'], predictions, labels=['__label__negative', '__label__neutral', '__label__positive'])
print("Confusion Matrix:")
print(conf_matrix)

# Display the classification report including precision, recall, and F1-score
print("\nClassification Report:")
print(classification_report(test_data['label'], predictions, target_names=['Negative', 'Neutral', 'Positive']))

Confusion Matrix:
[[2917  252  385]
 [ 449  619  367]
 [ 369  245 2253]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.78      0.82      0.80      3554
     Neutral       0.55      0.43      0.49      1435
    Positive       0.75      0.79      0.77      2867

    accuracy                           0.74      7856
   macro avg       0.70      0.68      0.68      7856
weighted avg       0.73      0.74      0.73      7856



In [25]:
import fasttext
from sklearn.metrics import classification_report

# Define hyperparameter grid for tuning
learning_rates = [0.01, 0.05, 0.1]
epochs = [25, 50]
word_ngrams = [1, 2]
dimensions = [50, 100]

# Variables to track the best model and its performance
best_precision = 0
best_model = None
best_params = {}

# Iterate over all combinations of hyperparameters
for lr in learning_rates:
    for epoch in epochs:
        for ngram in word_ngrams:
            for dim in dimensions:
                print(f"Training model with lr={lr}, epoch={epoch}, wordNgrams={ngram}, dim={dim}")

                try:
                    # Train the FastText model with the current hyperparameters
                    model = fasttext.train_supervised(
                        input='/content/drive/My Drive/Sentiment_Analysis/bbc_news_train_combined.txt',
                        lr=lr,
                        epoch=epoch,
                        wordNgrams=ngram,
                        dim=dim,
                        verbose=2,
                        minCount=1
                    )

                    # Get predictions for the test dataset
                    predictions = [model.predict(row)[0][0] for row in test_data['text_only']]

                    # Calculate the precision for the current model
                    precision = classification_report(
                        test_data['label'], predictions, output_dict=True
                    )['weighted avg']['precision']

                    print(f"Precision: {precision}")

                    # Update the best model if the current one outperforms previous ones
                    if precision > best_precision:
                        best_precision = precision
                        best_model = model
                        best_params = {'lr': lr, 'epoch': epoch, 'wordNgrams': ngram, 'dim': dim}
                        print(f"New best model found with precision: {best_precision} and parameters: {best_params}")
                except RuntimeError as error:
                    print(f"Error encountered with parameters: lr={lr}, epoch={epoch}, wordNgrams={ngram}, dim={dim}")
                    print(f"RuntimeError: {error}")
                    continue

# Save the best model if found
if best_model:
    best_model.save_model('/content/drive/My Drive/Sentiment_Analysis/bbc_news_best_tuned_model.bin')
    print(f"Best model saved with parameters: {best_params} and precision: {best_precision}")
else:
    print("No suitable model was found during hyperparameter tuning.")

Training model with lr=0.01, epoch=25, wordNgrams=1, dim=50
Error encountered with parameters: lr=0.01, epoch=25, wordNgrams=1, dim=50
RuntimeError: Encountered NaN.
Training model with lr=0.01, epoch=25, wordNgrams=1, dim=100
Precision: 0.7247297954101589
New best model found with precision: 0.7247297954101589 and parameters: {'lr': 0.01, 'epoch': 25, 'wordNgrams': 1, 'dim': 100}
Training model with lr=0.01, epoch=25, wordNgrams=2, dim=50
Precision: 0.7050998341140617
Training model with lr=0.01, epoch=25, wordNgrams=2, dim=100
Precision: 0.7058740050282146
Training model with lr=0.01, epoch=50, wordNgrams=1, dim=50
Precision: 0.7230775265966417
Training model with lr=0.01, epoch=50, wordNgrams=1, dim=100
Precision: 0.7230856553265154
Training model with lr=0.01, epoch=50, wordNgrams=2, dim=50
Precision: 0.7326003520497344
New best model found with precision: 0.7326003520497344 and parameters: {'lr': 0.01, 'epoch': 50, 'wordNgrams': 2, 'dim': 50}
Training model with lr=0.01, epoch=50,

In [26]:
import pandas as pd
from sklearn.utils import resample

# Separate the classes into different dataframes
df_negative = df[df['combined_sentiment'] == '__label__negative']
df_positive = df[df['combined_sentiment'] == '__label__positive']
df_neutral = df[df['combined_sentiment'] == '__label__neutral']

# Determine the target size for oversampling (same as the positive class size)
target_size = len(df_positive)

# Perform random oversampling on the neutral class to reach the target size
df_neutral_upsampled = resample(df_neutral,
                                replace=True,
                                n_samples=target_size,
                                random_state=42)

# Combine the oversampled neutral class with the original positive and negative classes
df_balanced = pd.concat([df_negative, df_positive, df_neutral_upsampled])
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the new class distribution
print("Class Distribution After Oversampling Neutral Class:")
print(df_balanced['combined_sentiment'].value_counts())

Class Distribution After Oversampling Neutral Class:
combined_sentiment
__label__negative    17801
__label__positive    14401
__label__neutral     14401
Name: count, dtype: int64


In [27]:
# Save the balanced dataset to a file
balanced_output_file = '/content/drive/My Drive/Sentiment_Analysis/bbc_news_balanced_combined.txt'
df_balanced['combined_fasttext_format_cleaned'].to_csv(balanced_output_file, index=False, header=False)

In [28]:
# Train the FastText model using the balanced dataset
balanced_model = fasttext.train_supervised(
    input=balanced_output_file,
    lr=1.0,
    epoch=25,
    wordNgrams=2,
    verbose=2,
    minCount=1
)

# Save the balanced-trained model to a file
balanced_model.save_model('/content/drive/My Drive/Sentiment_Analysis/bbc_news_balanced_model.bin')
print("Balanced model trained and saved.")

Balanced model trained and saved.


In [29]:
# Generate predictions using the balanced model on the test dataset
test_data['balanced_predictions'] = test_data['text_only'].apply(lambda x: balanced_model.predict(x)[0][0])

# Generate the classification report with the predictions
from sklearn.metrics import classification_report

# Generate the classification
report = classification_report(test_data['label'], test_data['balanced_predictions'], target_names=['Negative', 'Neutral', 'Positive'], output_dict=True)

# Extract the weighted precision and recall
overall_precision = report['weighted avg']['precision']
overall_recall = report['weighted avg']['recall']

# Display the overall precision and recall
print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")

Overall Precision: 0.9810387925588343
Overall Recall: 0.9810336048879837


In [30]:
from sklearn.metrics import confusion_matrix, classification_report

# Get predictions for the test set
test_data['balanced_predictions'] = test_data['text_only'].apply(lambda x: balanced_model.predict(x)[0][0])

# Calculate and display the confusion matrix and classification report
conf_matrix = confusion_matrix(test_data['label'], test_data['balanced_predictions'],
                               labels=['__label__negative', '__label__neutral', '__label__positive'])
print("Confusion Matrix After Balancing:")
print(conf_matrix)

# Classification report
print("\nClassification Report After Balancing:")
print(classification_report(test_data['label'], test_data['balanced_predictions'],
                            target_names=['Negative', 'Neutral', 'Positive']))

Confusion Matrix After Balancing:
[[3539   13    2]
 [  63 1319   53]
 [   6   12 2849]]

Classification Report After Balancing:
              precision    recall  f1-score   support

    Negative       0.98      1.00      0.99      3554
     Neutral       0.98      0.92      0.95      1435
    Positive       0.98      0.99      0.99      2867

    accuracy                           0.98      7856
   macro avg       0.98      0.97      0.97      7856
weighted avg       0.98      0.98      0.98      7856

