#### Week 10 Milestone 4 Author: Rex Gayas Course & Section: DSC360-T301 Data Mining: Text Analytics an (2243-1) Date: 18 FEB 2024

#### Data Loading and Preprocessing

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Define a function to clean the text
def clean_text(text):
    # If text is NaN (float), return an empty string
    if pd.isna(text):
        return ""
    # Convert text to lowercase
    text = text.lower()
    # Remove URLs, hashtags, mentions, and stopwords
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    # Join the tokens back into a string
    return ' '.join(tokens)

import nltk


# Load Twitter dataset
twitter_file_path = 'D:/ALPHA\Dynamic Folder/Bellevue/Winter 2023/Data Mining/Project/Datasets/Kaggle/twitter_sentiment_data.csv'  
twitter_df = pd.read_csv(twitter_file_path)
twitter_df['cleaned_text'] = twitter_df['message'].apply(clean_text)

# Load Reddit dataset and sample 10% of the data
reddit_file_path = 'D:/ALPHA/Dynamic Folder/Bellevue/Winter 2023/Data Mining/Project/Datasets/Kaggle/reddit_opinion_climate_change.csv' 
reddit_df = pd.read_csv(reddit_file_path)
reddit_sample_df = reddit_df.sample(frac=0.1, random_state=42)  # Sample 10% of the data
reddit_sample_df['cleaned_text'] = reddit_sample_df['self_text'].apply(clean_text)

# Display the first few rows to confirm successful loading and cleaning
print(twitter_df.head())
print(reddit_sample_df.head())


   sentiment                                            message  \
0         -1  @tiniebeany climate change is an interesting h...   
1          1  RT @NatGeoChannel: Watch #BeforeTheFlood right...   
2          1  Fabulous! Leonardo #DiCaprio's film on #climat...   
3          1  RT @Mick_Fanning: Just watched this amazing do...   
4          2  RT @cnalive: Pranita Biswasi, a Lutheran from ...   

              tweetid                                       cleaned_text  
0  792927353886371840  climate change interesting hustle global warmi...  
1  793124211518832641  rt : watch beforetheflood right , travels worl...  
2  793124402388832256  fabulous ! leonardo dicaprio 's film climate c...  
3  793124635873275904  rt : watched amazing documentary leonardodicap...  
4  793125156185137153  rt : pranita biswasi , lutheran odisha , gives...  
       comment_id  score                                          self_text  \
268628    k14k1rf     16  They are taking the moral stance without p

In [4]:
import requests
import pandas as pd

# NewsAPI key
api_key = '586380ef164f4b9287f2e14a286cc9ef'

# Define the endpoint and parameters
url = 'https://newsapi.org/v2/everything'
params = {
    'q': 'climate change',  # Query for articles mentioning climate change
    'apiKey': api_key,
    'pageSize': 20,  # Number of articles to return
    'page': 1  # Page number 
}

# Make the request to the NewsAPI
response = requests.get(url, params=params)

# Check if the request was successful
if response.status_code == 200:
    articles = response.json()['articles']
    # Create a DataFrame with the relevant information
    df_articles = pd.DataFrame(articles)
    print(df_articles.head())  # Display the first few entries
else:
    print(f"Error: {response.status_code}")




                                     source             author  \
0  {'id': 'the-verge', 'name': 'The Verge'}    Jess Weatherbed   
1  {'id': 'the-verge', 'name': 'The Verge'}      Justine Calma   
2  {'id': 'the-verge', 'name': 'The Verge'}      Justine Calma   
3          {'id': 'wired', 'name': 'Wired'}  Stephen Armstrong   
4          {'id': 'wired', 'name': 'Wired'}      Matt Reynolds   

                                               title  \
0                  Paris votes to crack down on SUVs   
1  The threat of extinction is getting worse for ...   
2  NASA’s new mission will study microscopic plan...   
3  Climate Finance Is Targeting the Wrong Industries   
4  Inside the Beef Industry’s Campaign to Influen...   

                                         description  \
0  Parisians have voted to increase parking charg...   
1  A new United Nations report gives the fullest ...   
2  A new NASA mission will study microscopic plan...   
3  Roughly half of the world’s emissions c

In [5]:
import pandas as pd

# Convert the list of articles into a DataFrame
news_df = pd.DataFrame(articles)

# Check the columns and preview the DataFrame 
print(news_df.columns)
print(news_df.head())


Index(['source', 'author', 'title', 'description', 'url', 'urlToImage',
       'publishedAt', 'content'],
      dtype='object')
                                     source             author  \
0  {'id': 'the-verge', 'name': 'The Verge'}    Jess Weatherbed   
1  {'id': 'the-verge', 'name': 'The Verge'}      Justine Calma   
2  {'id': 'the-verge', 'name': 'The Verge'}      Justine Calma   
3          {'id': 'wired', 'name': 'Wired'}  Stephen Armstrong   
4          {'id': 'wired', 'name': 'Wired'}      Matt Reynolds   

                                               title  \
0                  Paris votes to crack down on SUVs   
1  The threat of extinction is getting worse for ...   
2  NASA’s new mission will study microscopic plan...   
3  Climate Finance Is Targeting the Wrong Industries   
4  Inside the Beef Industry’s Campaign to Influen...   

                                         description  \
0  Parisians have voted to increase parking charg...   
1  A new United Nations re

#### Feature Engineering

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Start the vectorizer with reasonable parameters to handle large text data
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=10000)

# Apply the vectorizer to each dataset
twitter_tfidf = tfidf_vectorizer.fit_transform(twitter_df['cleaned_text'])
reddit_tfidf = tfidf_vectorizer.transform(reddit_sample_df['cleaned_text'])
news_tfidf = tfidf_vectorizer.transform(news_df['content'])  

# Dimensionality reduction 
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=500, random_state=42)
twitter_reduced = svd.fit_transform(twitter_tfidf)
reddit_reduced = svd.transform(reddit_tfidf)
news_reduced = svd.transform(news_tfidf)


In [7]:
# Check shape of transformation after applying TF-IDF vectorization
print("Twitter TF-IDF features shape:", twitter_tfidf.shape)
print("Reddit TF-IDF features shape:", reddit_tfidf.shape)
print("News TF-IDF features shape:", news_tfidf.shape)

# Check shape of transformation after applying dimensionality reduction
print("Twitter reduced features shape:", twitter_reduced.shape)
print("Reddit reduced features shape:", reddit_reduced.shape)
print("News reduced features shape:", news_reduced.shape)


Twitter TF-IDF features shape: (43943, 10000)
Reddit TF-IDF features shape: (30374, 10000)
News TF-IDF features shape: (20, 10000)
Twitter reduced features shape: (43943, 500)
Reddit reduced features shape: (30374, 500)
News reduced features shape: (20, 500)


#### Data Labeling

In [8]:
# Redefine the keywords for labeling
believer_keywords = ["support", "agree", "pro"]
skeptic_keywords = ["doubt", "deny", "skeptic"]
undecided_keywords = ["unsure", "question", "undecided"]

# Define the labeling function
def label_sentiment(text):
    text = text.lower()
    if any(keyword in text for keyword in believer_keywords):
        return 'believer'
    elif any(keyword in text for keyword in skeptic_keywords):
        return 'skeptic'
    elif any(keyword in text for keyword in undecided_keywords):
        return 'undecided'
    else:
        return 'unknown'  # For tweets that do not contain any of the keywords

# Load the Twitter dataset
twitter_df = pd.read_csv('D:/ALPHA/Dynamic Folder/Bellevue/Winter 2023/Data Mining/Project/Datasets/Kaggle/twitter_sentiment_data.csv')

# Apply the labeling function to the 'message' column
twitter_df['sentiment_label'] = twitter_df['message'].apply(label_sentiment)

# Display the distribution of the labeled sentiments
twitter_df['sentiment_label'].value_counts()



sentiment_label
unknown      37980
believer      4698
skeptic        929
undecided      336
Name: count, dtype: int64

#### Model Training using Logistic Regression & Subsequent Evaluation

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Filter out 'unknown' labeled data
filtered_df = twitter_df[twitter_df['sentiment_label'] != 'unknown']

# Vectorize the text using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(filtered_df['message'])

# Labels
y = filtered_df['sentiment_label']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict on the test set
y_pred = lr_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report


(0.9337803855825649,
 '              precision    recall  f1-score   support\n\n    believer       0.93      0.99      0.96       957\n     skeptic       0.91      0.73      0.81       176\n   undecided       0.98      0.68      0.80        60\n\n    accuracy                           0.93      1193\n   macro avg       0.94      0.80      0.86      1193\nweighted avg       0.93      0.93      0.93      1193\n')

#### Iterative Refinement

In [11]:
# Filter the dataset to include only 'unknown' labeled tweets for prediction
unknown_df = twitter_df.loc[twitter_df['sentiment_label'] == 'unknown']

# Vectorize the 'unknown' tweets using the fitted TF-IDF vectorizer
X_unknown_tfidf = tfidf_vectorizer.transform(unknown_df['message'])

# Predict sentiments for the 'unknown' tweets
y_unknown_pred = lr_model.predict(X_unknown_tfidf)

# Add the predicted labels back to the unknown_df DataFrame 
unknown_df.loc[:, 'sentiment_label'] = y_unknown_pred

# Combine the originally labeled data with the newly labeled data
augmented_df = pd.concat([filtered_df, unknown_df])

# Re-vectorize the text of the augmented dataset
X_augmented_tfidf = tfidf_vectorizer.transform(augmented_df['message'])
y_augmented = augmented_df['sentiment_label']

# Re-split the augmented dataset into training and testing sets
X_train_augmented, X_test_augmented, y_train_augmented, y_test_augmented = train_test_split(
    X_augmented_tfidf, y_augmented, test_size=0.2, random_state=42)

# Re-train the Logistic Regression model on the augmented dataset
lr_model.fit(X_train_augmented, y_train_augmented)

# Predict on the augmented test set and evaluate the model again
y_pred_augmented = lr_model.predict(X_test_augmented)
accuracy_augmented = accuracy_score(y_test_augmented, y_pred_augmented)
report_augmented = classification_report(y_test_augmented, y_pred_augmented)

accuracy_augmented, report_augmented


(0.9854363408806462,
 '              precision    recall  f1-score   support\n\n    believer       0.99      1.00      0.99      8531\n     skeptic       0.91      0.56      0.69       190\n   undecided       1.00      0.50      0.67        68\n\n    accuracy                           0.99      8789\n   macro avg       0.97      0.69      0.78      8789\nweighted avg       0.98      0.99      0.98      8789\n')