# **Step 1: Import Required Libraries**

In [36]:
import re

import nltk
import pandas as pd
import plotly.express as px
import textblob
from nltk.corpus import stopwords
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob

# **Step 2: Load the Data**

In [37]:
# Load the training and validation datasets
train_df = pd.read_csv('twitter_training.csv')
validation_df = pd.read_csv('twitter_validation.csv')

# Sample of the training dataset.
# Only the last expression of a cell is rendered automatically, so this
# preview needs an explicit display() call to show up in the output.
display(train_df.head())

# Sample of the validation dataset (rendered as the cell's final expression)
validation_df.head()

Unnamed: 0,Sr no,Company,sentiment_labels,tweets
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


# **Step 3: Preprocess the Data**

In [38]:
# Make the NLTK stopword corpus available, then keep the English entries in a
# set so the membership checks done during tweet cleaning are cheap
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalize a raw tweet into a space-joined string of lowercase tokens.

    Anything starting with ``http`` up to the next whitespace is removed,
    every run of non-alphanumeric characters is collapsed to a single space,
    the result is lowercased and split on whitespace, and tokens found in the
    module-level ``stop_words`` set are dropped.

    Returns an empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ''

    # Strip links first so their fragments do not survive as tokens
    without_links = re.sub(r'http\S+', '', text)
    alnum_only = re.sub(r'[^A-Za-z0-9]+', ' ', without_links)

    kept = (tok for tok in alnum_only.lower().split() if tok not in stop_words)
    return ' '.join(kept)

# Add a 'Cleaned Text' column to the training and validation datasets by
# running every tweet through preprocess_text
for frame in (train_df, validation_df):
    frame['Cleaned Text'] = frame['tweets'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# **Step 4: Sentiment Analysis**

In [39]:
def get_sentiment(text):
    """Label ``text`` by the sign of its TextBlob polarity score.

    Returns 'Positive' when the polarity is above zero, 'Neutral' when it is
    exactly zero, and 'Negative' otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity == 0:
        return 'Neutral'
    return 'Negative'

# Add a TextBlob-derived 'Sentiment' column to the training and validation
# datasets, computed from the cleaned tweet text
for frame in (train_df, validation_df):
    frame['Sentiment'] = frame['Cleaned Text'].apply(get_sentiment)

# **Step 5: Visualize the Sentiment**

In [40]:
# Training dataset: number of tweets per 'Sentiment' label, as a two-column frame
train_sentiment_counts = (
    train_df['Sentiment']
    .value_counts()
    .rename_axis('Sentiment')
    .reset_index(name='Count')
)

fig = px.bar(
    train_sentiment_counts,
    x='Sentiment',
    y='Count',
    title='Training Dataset Sentiment Distribution',
)
fig.show()

# Validation dataset: same tally and chart
validation_sentiment_counts = (
    validation_df['Sentiment']
    .value_counts()
    .rename_axis('Sentiment')
    .reset_index(name='Count')
)

fig = px.bar(
    validation_sentiment_counts,
    x='Sentiment',
    y='Count',
    title='Validation Dataset Sentiment Distribution',
)
fig.show()

# **Step 6: Machine Learning - Sentiment Prediction**

In [41]:
  # Hold out 20% of the labelled tweets for testing; the fixed seed keeps the
  # split repeatable
  X_train, X_test, y_train, y_test = train_test_split(
      train_df['Cleaned Text'],
      train_df['sentiment_labels'],
      test_size=0.2,
      random_state=42,
  )

  # Fit the TF-IDF vocabulary (capped at 5000 features) on the training split
  # only, then reuse it to transform the test split
  tfidf_vectorizer = TfidfVectorizer(max_features=5000)
  X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
  X_test_tfidf = tfidf_vectorizer.transform(X_test)

  # Train a Multinomial Naive Bayes classifier (fit() returns the estimator)
  clf = MultinomialNB().fit(X_train_tfidf, y_train)

  # Score the predictions made on the held-out split
  y_pred = clf.predict(X_test_tfidf)
  accuracy = accuracy_score(y_test, y_pred)
  classification_rep = classification_report(y_test, y_pred)

  print(
      "Machine Learning Model Evaluation:",
      f'Accuracy: {accuracy}',
      classification_rep,
      sep='\n',
  )

Machine Learning Model Evaluation:
Accuracy: 0.642364597978175
              precision    recall  f1-score   support

  Irrelevant       0.73      0.36      0.48      2592
    Negative       0.62      0.81      0.71      4519
     Neutral       0.66      0.53      0.59      3596
    Positive       0.63      0.73      0.68      4230

    accuracy                           0.64     14937
   macro avg       0.66      0.61      0.61     14937
weighted avg       0.65      0.64      0.63     14937



# **Step 7: Predict Sentiment on Validation Dataset**

In [42]:
# Vectorize the validation dataset with the vectorizer fitted on the training split
validation_tfidf = tfidf_vectorizer.transform(validation_df['Cleaned Text'])

# Predict sentiment on the validation dataset
validation_predictions = clf.predict(validation_tfidf)

# Add predicted sentiment to the validation dataset
validation_df['Predicted Sentiment'] = validation_predictions

# Save the validation dataset with predictions to a CSV file
# (index=False keeps the row index out of the file)
validation_df.to_csv('twitter_validation_with_predictions.csv', index=False)

# Reduce the validation TF-IDF features to 3 components with PCA for plotting.
# toarray() densifies the TF-IDF matrix before it is handed to PCA, and
# random_state makes the projection repeatable when scikit-learn picks a
# randomized SVD solver.
pca = PCA(n_components=3, random_state=42)
validation_tfidf_pca = pca.fit_transform(validation_tfidf.toarray())

# Create a DataFrame for the reduced data
reduced_df = pd.DataFrame(
    data=validation_tfidf_pca,
    columns=['Component 1', 'Component 2', 'Component 3'],
)

# Add the predicted sentiment labels so points can be colored by class
reduced_df['Predicted Sentiment'] = validation_predictions

# Create a 3D scatter plot
fig = px.scatter_3d(
    reduced_df,
    x='Component 1',
    y='Component 2',
    z='Component 3',
    color='Predicted Sentiment',
    title='3D Scatter Plot of Predicted Sentiments (PCA Reduced)',
)

# Show the plot
fig.show()

# **Step 8: Compare the TextBlob and Model Predictions**

In [43]:
# Re-read the validation dataset together with the saved predictions
validation_df = pd.read_csv('twitter_validation_with_predictions.csv')

# Tally labels per column: 'Sentiment' feeds the "Previous" counts and
# 'Predicted Sentiment' feeds the "Current" counts
previous_sentiment_counts = (
    validation_df['Sentiment']
    .value_counts()
    .rename_axis('Sentiment')
    .reset_index(name='Previous Count')
)

current_sentiment_counts = (
    validation_df['Predicted Sentiment']
    .value_counts()
    .rename_axis('Sentiment')
    .reset_index(name='Current Count')
)

# Outer-join on the label so a label present in only one column is kept;
# its missing count is then filled with 0
comparison_df = previous_sentiment_counts.merge(
    current_sentiment_counts,
    on='Sentiment',
    how='outer',
).fillna(0)

# Grouped bar chart: one pair of bars per label
fig = px.bar(
    comparison_df,
    x='Sentiment',
    y=['Previous Count', 'Current Count'],
    barmode='group',
    title='Comparison of Previous and Current Predictions',
    labels={'value': 'Count'},
)

# Show the plot
fig.show()