<a href="https://colab.research.google.com/github/Veranzi/Data_Science/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMPORT LIBRARIES

In [None]:
pip install nrclex


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nrclex import NRCLex

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from scipy.stats import norm

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the sentiment dataset into `df`.
# NOTE(review): the absolute path presumably requires Google Drive to be mounted at /content/drive (Colab) — confirm.
df=pd.read_csv("/content/drive/MyDrive/sentimentdataset.csv")

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Descriptive statistics of the raw frame.
df.describe()

In [None]:
# Column labels of the raw frame.
df.columns

In [None]:
# (rows, columns) of the raw frame.
df.shape

DATA CLEANING

In [None]:
# Display the frame before any cleaning step is applied.
df

In [None]:
# Pull out the 'Sentiment' column as a Series and display it.
df1 = df['Sentiment']
df1


REMOVE COLUMNS NOT NEEDED

In [None]:
# Drop the two leftover index columns. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps the cell idempotent: re-running it no longer raises KeyError
# once the columns are already gone.
df = df.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [None]:
# Number of missing values in each column.
df.isna().sum()

KEEP ONLY THE "OBJECT" (TEXT) COLUMNS SO THEIR WHITESPACE GAPS CAN BE REMOVED

In [None]:
# Keep only the object-dtype columns; every non-object column is dropped from `df`
# from this point on.
df = df.select_dtypes(include=[object])

# Display the names of the columns that remain.
list(df.columns)

Remove leading and trailing whitespace from the string columns

In [None]:

# Trim surrounding whitespace in every object column; columns of any other dtype pass through unchanged.
df[df.columns] = df[df.columns].apply(
    lambda column: column if column.dtype != "object" else column.str.strip()
)
df


Handling duplicates

In [None]:
# Boolean Series flagging rows that repeat an earlier row (all columns compared).
df.duplicated()

The number of duplicated rows

In [None]:
# Count rows that repeat an earlier (Text, Timestamp, Platform) combination.
# Summing the boolean mask avoids materialising the filtered frame just to take its length.
len_duplicated = int(df.duplicated(subset=['Text', 'Timestamp', 'Platform']).sum())
# Show the result: the cell previously computed the count but displayed nothing.
len_duplicated


Count unique rows based on the 'Text', 'Timestamp' and 'Platform' columns

In [None]:

# Rows that are NOT a repeat of an earlier (Text, Timestamp, Platform) combination.
unique_rows = int((~df.duplicated(subset=['Text', 'Timestamp', 'Platform'])).sum())
unique_rows

Remove duplicated entries and save the result to a new dataframe

In [None]:

# Keep the first occurrence of every (Text, Timestamp, Platform) combination.
# .copy() makes the result an independent frame: later cells add columns to
# `no_duplicated`, which would otherwise be chained assignment on a frame derived from `df`.
no_duplicated = df.drop_duplicates(subset=['Text', 'Timestamp', 'Platform'], keep='first').copy()
no_duplicated

Find unique values in platform

In [None]:
# Distinct values found in the 'Platform' column after de-duplication.
no_duplicated['Platform'].unique()


In [None]:
# One de-duplicated subset per platform, selected by exact match on 'Platform'.
twitter, instagram, facebook = (
    no_duplicated[no_duplicated["Platform"] == platform_name]
    for platform_name in ('Twitter', 'Instagram', 'Facebook')
)

In [None]:
# Display the de-duplicated frame.
no_duplicated

EDA

Data based on platform

In [None]:
# Bar labels and their colours, in the same order as the counts below.
platforms = ['Twitter', 'Instagram', 'Facebook']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

# Non-null 'Text' entries in each platform subset.
counts = [frame['Text'].count() for frame in (twitter, instagram, facebook)]

# One bar per platform.
plt.bar(platforms, counts, color=colors)
plt.title('Counts of Text Entries by Platform')
plt.xlabel('Platforms')
plt.ylabel('Counts')
plt.show()

In [None]:
# Missing values per column of `df` (the object-only, whitespace-stripped frame — not `no_duplicated`).
df.isnull().sum()

In [None]:
# Number of posts per timestamp, one line per platform.
# NOTE(review): 'Timestamp' is still an object (string) column here, so matplotlib treats the
# x axis as categorical — confirm whether it should be parsed with pd.to_datetime first.
platform_lines = (
    (twitter, "#6699CC", 'Twitter'),
    (instagram, "#CC6677", 'Instagram'),
    (facebook, "#997700", 'Facebook'),
)
for frame, line_color, platform_label in platform_lines:
    plt.plot(frame.groupby(['Timestamp']).Platform.count(),
             color=line_color, marker='o', label=platform_label)
plt.xlabel('Timestamp')
plt.ylabel('Counts')
plt.title('Posts per Timestamp by Platform')
plt.legend()
# Render explicitly so the cell output is the figure, not the Legend object's repr.
plt.show()

In [None]:
# Bar labels and their colours, in the same order as the counts below.
platforms = ['Twitter', 'Instagram', 'Facebook']
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']

# Non-null 'Sentiment' entries in each platform subset.
counts = [frame['Sentiment'].count() for frame in (twitter, instagram, facebook)]

# One bar per platform.
plt.bar(platforms, counts, color=colors)
plt.title('Counts of Sentiment Entries by Platform')
plt.xlabel('Platforms')
plt.ylabel('Counts')
plt.show()

In [None]:

# Distribution of posts over 'Platform', coloured by 'Country'.
# NOTE(review): the palette lists three colours while hue is 'Country'; if 'Country' has more
# than three distinct values the colours cannot be unique — confirm the intended hue/palette.
# NOTE(review): kde=True is requested on a categorical x ('Platform') — confirm it is meaningful here.
sns.displot(data=no_duplicated[['Platform','Country']], x="Platform", hue="Country",
            palette = ["#6699CC", '#CC6677', '#997700'],
            alpha=0.3,
           # multiple='dodge',
           # multiple='stack',
            kde=True)
plt.show()

In [None]:
# Ten most frequent sentiment labels and how often each occurs.
no_duplicated['Sentiment'].value_counts().nlargest(10).plot(kind='bar')
plt.title('Top 10 Sentiments based on Text')
plt.xlabel('Sentiment')
# The bar heights are value_counts() results, so the axis shows counts (it was labelled 'text').
plt.ylabel('Count')
plt.show()

In [None]:
# Share of de-duplicated posts per platform, drawn as a pie chart with percentage labels.
no_duplicated['Platform'].value_counts().plot.pie(autopct='%1.5f%%')
plt.legend()
plt.title('Percentages of Platforms')
plt.show()

In [None]:
# Ten countries with the most de-duplicated posts.
no_duplicated['Country'].value_counts().nlargest(10).plot.bar()
plt.xlabel('Country')
plt.ylabel('Count')
plt.title('Based on Country')
plt.show()

In [None]:
# nltk must be imported before use; on a fresh kernel this cell previously raised NameError
# because the import only appeared in a later cell.
import nltk

nltk.download('punkt')

In [None]:
import nltk


def _nrc_polarity(text):
    """Classify `text` by comparing NRCLex's 'positive' and 'negative' affect frequencies.

    Returns "Positive" when the positive frequency is larger, "Neutral" when the
    two are equal, and "Negative" otherwise. The text is lower-cased first.
    """
    frequencies = NRCLex(text.lower()).affect_frequencies
    positive_score = frequencies['positive']
    negative_score = frequencies['negative']
    if positive_score > negative_score:
        return "Positive"
    if positive_score == negative_score:
        return "Neutral"
    return "Negative"


# Build the whole column in one assignment instead of writing one cell at a time
# with .loc inside a Python loop over the index.
no_duplicated["Sentiment_NRC"] = no_duplicated["Text"].map(_nrc_polarity)

In [None]:
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

Extract the unique sentiments and build a map from each sentiment to a unique numeric label.
Then add a new column holding the numeric label for each row, based on that mapping.




In [None]:

# Sentiments in order of first appearance; each one's position becomes its numeric label.
unique_sentiments = no_duplicated['Sentiment'].unique()
sentiment_label_map = dict(zip(unique_sentiments, range(len(unique_sentiments))))

# Numeric target column derived from the mapping above.
no_duplicated['Label'] = no_duplicated['Sentiment'].map(sentiment_label_map)

# Show text, original sentiment and the assigned label side by side.
print(no_duplicated[['Text', 'Sentiment', 'Label']])

Split data into train and test sets

In [None]:

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
# Features are the raw 'Text' strings, targets are the numeric 'Label' values.
X_train, X_test, y_train, y_test = train_test_split(no_duplicated['Text'], no_duplicated['Label'], test_size=0.2, random_state=42)

Create TF-IDF vectorizer

In [None]:
# TF-IDF features capped at 1000 terms.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
# Fit the vectorizer on the training text only, then reuse it for the test text
# so the test split never influences the fitted vectorizer.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so the fitted tree (and every metric computed from it) is reproducible
# across runs; it matches the random_state used for the train/test split.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_tfidf, y_train)
# NOTE(review): this second, unfitted vectorizer is not used by any other cell shown here;
# kept only so the name stays defined — confirm it can be removed.
tfidf_transformer = TfidfVectorizer()

Predict

In [None]:
# Predicted numeric labels for the held-out test features.
y_pred = dt_model.predict(X_test_tfidf)
y_pred

Dataframe prediction

In [None]:

# Put the predictions in their own frame. A dedicated name is used so the cleaned
# dataset held in `df` is not overwritten (re-running earlier cells that read `df`
# would otherwise operate on this one-column frame).
predictions_df = pd.DataFrame({'Predicted Values': y_pred})

predictions_df

In [None]:
# Text report of per-class metrics for the test split; the string is parsed and plotted in later cells.
# zero_division=1 is the value reported for a metric whose denominator is zero.
report = classification_report(y_test, y_pred, zero_division=1)

With y_pred and y_true as the predicted and true values respectively, calculate and print the accuracy.

In [None]:
from sklearn.metrics import accuracy_score

# Ground truth is the held-out label split; predictions come from the fitted tree.
y_true = y_test
y_pred = dt_model.predict(X_test_tfidf)

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy:.2f}")


In [None]:
def parse_classification_report(report):
    """Parse the text output of ``classification_report`` into names and numbers.

    Parameters
    ----------
    report : str
        The multi-line report string (header line, one row per class, then
        the summary rows).

    Returns
    -------
    class_names : list of str
        Row labels in report order: every class, followed by the average rows
        (e.g. ``'macro avg'``). The ``accuracy`` row is not included.
    metrics : list of list of float
        One ``[precision, recall, f1-score, support]`` list per entry of
        ``class_names``.
    """
    # Drop blank lines and surrounding padding.
    lines = [line.strip() for line in report.split("\n") if line.strip()]
    class_names = []
    metrics = []
    for line in lines:
        parts = line.split()
        # A metrics row is a label (one or more tokens) followed by four numbers.
        # This skips the 4-token column header and the 3-token accuracy row without
        # relying on a fixed starting index, which previously dropped the first class
        # (blank lines are already filtered out, so data rows start right after the header).
        if len(parts) < 5 or parts[0] == 'accuracy':
            continue
        # Everything before the last four tokens is the label (it may contain spaces).
        class_names.append(' '.join(parts[:-4]))
        metrics.append([float(part) for part in parts[-4:]])

    return class_names, metrics

# Parse the report string into row labels and their numeric metric rows for plotting.
class_names, metrics = parse_classification_report(report)

In [None]:
# Column order of the numbers in each parsed metrics row (the first three positions are used).
metric_names = ['precision', 'recall', 'f1-score']
# Regroup the row-wise metrics into one list of values per metric name.
metric_dict = {
    name: [row[position] for row in metrics]
    for position, name in enumerate(metric_names)
}

Plotting the classification report metrics

In [None]:
# Draw the metrics as grouped horizontal bars. Each metric gets its own offset within a
# class's slot; previously every metric was drawn at the same y position, so the bars
# were painted on top of each other and hid one another.
n_metrics = len(metric_dict)
bar_height = 0.8 / max(n_metrics, 1)
y_positions = np.arange(len(class_names))

# Scale the figure height with the number of rows so the labels stay readable.
fig, ax = plt.subplots(figsize=(8, max(4, 0.35 * len(class_names))))
for offset, (metric_name, values) in enumerate(metric_dict.items()):
    ax.barh(y_positions + offset * bar_height, values, height=bar_height, label=metric_name)

# Centre each tick on its group of bars and label it with the class name.
ax.set_yticks(y_positions + bar_height * (n_metrics - 1) / 2)
ax.set_yticklabels(class_names)
ax.set_xlabel('Score')
# The y axis lists classes (the metrics are identified by the legend).
ax.set_ylabel('Class')
ax.legend()
fig.tight_layout()
plt.show()

Function to predict sentiment and label from input text

In [None]:


def predict_sentiment(input_text):
    """Predict the sentiment of a single piece of text.

    Uses the notebook-level ``tfidf_vectorizer`` (already fitted), ``dt_model``
    and ``unique_sentiments`` objects.

    Parameters
    ----------
    input_text : str
        The text to classify.

    Returns
    -------
    tuple
        ``(predicted_sentiment, predicted_label)``: the sentiment found at
        position ``predicted_label`` of ``unique_sentiments``, and the label
        predicted by the model.
    """
    # The vectorizer expects a collection of documents, so wrap the single text in a list.
    features = tfidf_vectorizer.transform([input_text])

    # One input row gives one prediction; take it out of the result array.
    predicted_label = dt_model.predict(features)[0]

    # Labels are positions in `unique_sentiments`, so indexing recovers the sentiment.
    return unique_sentiments[predicted_label], predicted_label


Practical example: prompt the user for a text and predict its sentiment.

In [None]:

# Read one line of text from the user (this blocks until input is entered).
user_input = input("Enter a text to be analyzed: ")
# Classify it with the fitted vectorizer and model via predict_sentiment.
predicted_sentiment, predicted_label = predict_sentiment(user_input)

# Report both the sentiment name and its numeric label.
print(f"Predicted Sentiment: {predicted_sentiment}")
print(f"Predicted Label: {predicted_label}")
