# **Importing Libraries**

In [None]:
import nltk
# Fetch the NLTK data packages used by the tokenizer, stop-word filter,
# lemmatizer import and VADER analyzer below.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer behind word_tokenize() from
# 'punkt_tab' rather than 'punkt'; requesting both keeps old and new
# installations working.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

#!pip install contractions

import pandas as pd
import matplotlib.pyplot as plt
#import plotly.graph_objects as go
import numpy as np
import contractions
import re

import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.filterwarnings('ignore')

# **Data loading and Cleaning**

In [None]:
# Load the tweet dump and draw a reproducible 5,000-row working sample.

# NOTE(review): absolute, machine-specific path -- point this at a shared or
# relative data directory before handing the notebook to someone else.
DATA_PATH = r'C:\Users\ritik\Downloads\archive (12)\training.1600000.processed.noemoticon.csv'
COLUMN_NAMES = ['sentiment','text id','Time of Tweet', 'flag', 'Account Name', 'text']
SAMPLE_SEED = 42

# header=None + names=...: the file is read as pure data so its first tweet is
# not swallowed as a header row, and the columns are labelled at load time.
# NOTE(review): assumes the CSV ships without a header line -- confirm.
df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1', header=None, names=COLUMN_NAMES)

# A fixed seed makes every downstream number repeatable across kernel restarts.
random_sample = df.sample(n=5000, random_state=SAMPLE_SEED)
random_sample.head()

In [None]:
# Summarise the sampled frame (columns, dtypes, non-null counts).
random_sample.info()

In [None]:
# Display the column labels of the sampled frame.
random_sample.columns

In [None]:
# Trim stray leading/trailing whitespace from every column label, then show
# the cleaned labels.
stripped_columns = random_sample.columns.str.strip()
random_sample.columns = stripped_columns
print(random_sample.columns)

In [None]:
# Discard rows with missing values, then remove the columns that the rest of
# the notebook never reads (only the timestamp and tweet text are kept).
unused_columns = ['sentiment','text id','Account Name','flag']
random_sample = random_sample.dropna().drop(columns=unused_columns)

In [None]:
# Parse the tweet timestamps into datetimes. The format treats "PDT" as a
# literal token, so any row carrying a different zone label will fail to parse.
tweet_time_format = '%a %b %d %H:%M:%S PDT %Y'
random_sample['Time of Tweet'] = pd.to_datetime(
    random_sample['Time of Tweet'],
    format=tweet_time_format,
)
random_sample.info()

In [None]:
# Spot-check five randomly chosen rows (unseeded, so they differ on each run).
random_sample.sample(5)

# **Text Pre-Processing**

In [None]:

# Patterns are compiled once, when the cell runs, instead of on every tweet.
# The dot after "www" is escaped so only a literal "www." prefix counts as a URL.
_URL_RE = re.compile(r'(?:https?|www\.)\S+')
_HANDLE_OR_HASHTAG_RE = re.compile(r'[@#]\w+')
_DIGIT_RUN_RE = re.compile(r'\d+')
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z\s]')
_REPEATED_CHAR_RE = re.compile(r'(.)\1\1+')

# Built once rather than re-reading the stop-word corpus for each tweet.
# NOTE(review): the NLTK English list appears to include negations such as
# "not"/"no"; removing them before VADER scoring can change the detected
# polarity -- confirm this is intended.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Clean a single tweet for sentiment scoring and vectorisation.

    Steps, in order: lowercase, expand contractions, strip URLs, strip
    @handles and #hashtags, tokenize, drop English stop words and punctuation
    tokens, remove digits and any remaining non-alphabetic characters, squeeze
    runs of three or more identical characters down to two, and collapse
    whitespace.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet with tokens separated by single spaces (may be empty).
    """
    # Convert to lowercase
    text = text.lower()

    # Expand contractions
    text = contractions.fix(text)

    # Remove URLs
    text = _URL_RE.sub('', text)

    # Remove Twitter handles and hashtags (the tag word itself is dropped too)
    text = _HANDLE_OR_HASHTAG_RE.sub('', text)

    # Tokenize, then drop stop words and punctuation tokens
    tokens = [
        word for word in word_tokenize(text)
        if word.lower() not in _ENGLISH_STOP_WORDS and word not in string.punctuation
    ]

    # Join tokens back into a sentence
    processed_text = ' '.join(tokens)

    # Remove digits
    processed_text = _DIGIT_RUN_RE.sub('', processed_text)

    # Remove non-alphabetic characters
    processed_text = _NON_ALPHA_RE.sub('', processed_text)

    # Squeeze 3+ repeated characters down to two (e.g., "aaaah" becomes "aah")
    processed_text = _REPEATED_CHAR_RE.sub(r'\1\1', processed_text)

    # Deleting digits/symbols can leave empty gaps; normalise to single spaces
    return ' '.join(processed_text.split())

# Run every raw tweet through the cleaner and keep the result in a new column.
random_sample['Processed_Text'] = random_sample['text'].map(preprocess_text)

In [None]:
# Peek at the first ten cleaned tweets as a one-column frame.
random_sample[['Processed_Text']].head(10)

In [None]:
# Spot-check five random rows, now including the Processed_Text column.
random_sample.sample(5)

# **Sentiment Tagging**

In [None]:
# One shared VADER analyzer scores every cleaned tweet.
sid = SentimentIntensityAnalyzer()

# Keep only the 'compound' entry of each polarity_scores() result.
random_sample['Sentiment_Score'] = random_sample['Processed_Text'].map(
    lambda tweet: sid.polarity_scores(tweet)['compound']
)

# Map sentiment scores to sentiment labels
def get_sentiment_label(score, threshold=0.05):
    """Map a VADER compound score to a sentiment label.

    The cut-offs are inclusive, following VADER's documented convention:
    ``score >= threshold`` is positive, ``score <= -threshold`` is negative,
    and anything strictly in between is neutral.

    Parameters
    ----------
    score : float
        Compound sentiment score.
    threshold : float, optional
        Magnitude at which a score stops being neutral. Defaults to 0.05.

    Returns
    -------
    str
        'positive', 'negative' or 'neutral'. A NaN score compares false on
        both sides and is therefore labelled 'neutral'.
    """
    if score >= threshold:
        return 'positive'
    if score <= -threshold:
        return 'negative'
    return 'neutral'

# Turn each numeric score into its categorical label.
random_sample['Sentiment_Label'] = random_sample['Sentiment_Score'].map(get_sentiment_label)

In [None]:
# Spot-check five random rows with their sentiment score and label.
random_sample.sample(5)

# **Building and Evaluating SVM model**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Features are the cleaned tweets and targets are the sentiment labels, both
# taken from `random_sample`.
X = random_sample['Processed_Text']
y = random_sample['Sentiment_Label']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features feeding a linear-kernel SVM, bundled into a single pipeline.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC(kernel='linear')),
]
model = Pipeline(steps=pipeline_steps)

# Fit on the training split, then predict the held-out split.
predictions = model.fit(X_train, y_train).predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 table.
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))



In [None]:
# Experimenting with random prompt
prompt = "I love this specilization"

# The pipeline was fitted on `Processed_Text`, so the raw prompt goes through
# the same cleaning step before it is vectorised and classified.
predicted_sentiment = model.predict([preprocess_text(prompt)])[0]
print(f"Predicted Sentiment: {predicted_sentiment}")

# **Building and Evaluating Random Forest Classifier Model**

In [None]:


from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Features are the cleaned tweets and targets are the sentiment labels, both
# taken from `random_sample`.
X = random_sample['Processed_Text']
y = random_sample['Sentiment_Label']

# Same 80/20 split and seed as the other model cells.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF features feeding a 100-tree random forest (seeded).
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
]
model = Pipeline(steps=pipeline_steps)

# Fit on the training split, then predict the held-out split.
predictions = model.fit(X_train, y_train).predict(X_test)

# Report overall accuracy and the per-class precision/recall/F1 table.
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))




In [None]:
# Experimenting with random prompt
prompt = "today is my project reviw i wish it will go well"

# The pipeline was fitted on `Processed_Text`, so the raw prompt goes through
# the same cleaning step before it is vectorised and classified.
predicted_sentiment = model.predict([preprocess_text(prompt)])[0]
print(f"Predicted Sentiment: {predicted_sentiment}")

# **Building and Evaluating Naive Bayes Model**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Features are the cleaned tweets and targets are the sentiment labels, both
# taken from `random_sample`.
X = random_sample['Processed_Text']
y = random_sample['Sentiment_Label']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF vectorizer and Multinomial Naive Bayes classifier
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB())
])

# Train the model
model.fit(X_train, y_train)

# Predictions on the test set
predictions = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, predictions))
print("Classification Report:\n", classification_report(y_test, predictions))

# Example usage for making predictions on new data.
# The pipeline was fitted on `Processed_Text`, so the raw prompt goes through
# the same cleaning step before it is vectorised and classified.
prompt = "I love this product! It's amazing."
predicted_sentiment = model.predict([preprocess_text(prompt)])[0]
print(f"Predicted Sentiment: {predicted_sentiment}")


# **Classification Report**

In [None]:
# Data
# NOTE(review): these scores are typed in by hand, not read from the model
# cells above -- they will drift from the printed reports whenever the sample
# or the models change.
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 score']
classifiers = ['SVM', 'RF', 'NB']
values = [[0.92, 0.93, 0.92, 0.93], [0.89, 0.89, 0.90, 0.90], [0.73, 0.79, 0.70, 0.74]]
# Same three colours as before, expressed in forms matplotlib accepts.
colors = ['#2ca02c', (1.0, 127 / 255, 14 / 255, 0.7), '#1f77b4']

# Grouped bar chart drawn with matplotlib: the plotly import is commented out
# in the imports cell, so `go` is undefined and the plotly version cannot run.
fig, ax = plt.subplots(figsize=(10, 6))

x_positions = np.arange(len(metrics))
bar_width = 0.8 / len(classifiers)

for i in range(len(classifiers)):
    # Centre each classifier's bar group on its metric tick.
    offset = (i - (len(classifiers) - 1) / 2) * bar_width
    ax.bar(x_positions + offset, values[i], width=bar_width,
           label=classifiers[i], color=colors[i])

ax.set_xticks(x_positions)
ax.set_xticklabels(metrics)
ax.set_title('Classification Report')
ax.set_xlabel('Metrics')
ax.set_ylabel('Value')
ax.legend()

plt.show()

# **Distribution of Sentiments**

In [None]:
# Tally the tweets per sentiment label and summarise the timestamp column,
# then print both summaries in that order.
sentiment_counts = random_sample['Sentiment_Label'].value_counts()
time_stampp = random_sample['Time of Tweet'].describe()
for summary in (sentiment_counts, time_stampp):
    print(summary)

In [None]:
 # Bar plot for Sentiment
# Count tweets per label from `random_sample` -- the raw `df` has no
# 'Sentiment_Label' column. value_counts() already sorts labels from most to
# least frequent, which gives the bar order.
label_counts = random_sample['Sentiment_Label'].value_counts()
order = label_counts.index
color = 'C0'  # first colour of the active matplotlib colour cycle

# Drawn with matplotlib directly because seaborn is never imported here.
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(list(order), label_counts.to_numpy(), color=color)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
ax.set_title('Distribution of Sentiments')

# Print the exact count above each bar.
ax.bar_label(bars, fmt='%.0f', label_type='edge')
plt.show()



```
# This is formatted as code
```

# **Just a Curiosity**

In [None]:


# Count, case-insensitively, how often "jesus" occurs in each cleaned tweet.
random_sample['jesus_count'] = (
    random_sample['Processed_Text'].str.lower().str.count('jesus')
)

# Total the mentions per sentiment label, keeping the label as a normal column.
word_counts_by_sentiment = (
    random_sample
    .groupby('Sentiment_Label', as_index=False)['jesus_count']
    .sum()
)

# One bar per sentiment label.
fig, ax = plt.subplots(figsize=(10, 6))
word_counts_by_sentiment.plot.bar(
    x='Sentiment_Label', ax=ax, colormap='viridis', stacked=True
)
ax.set_title('Word Counts by Sentiment Label')
ax.set_xlabel('Sentiment Label')
ax.set_ylabel('Word Count')
plt.show()


## Getting the TOP 20 words by count and their weightage percentage

In [None]:


documents = random_sample['Processed_Text']

# Bag-of-words term counts, capped at 10,000 features with English stop words
# excluded.
vectorizer = CountVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

# Summing down the rows gives one corpus-wide total per term (flattened to 1-D).
word_counts = np.asarray(X.sum(axis=0)).ravel()

# Pair each term with its total and rank from most to least frequent.
word_counts_df = (
    pd.DataFrame({'Word': feature_names, 'Count': word_counts})
    .sort_values(by='Count', ascending=False)
)

# Each term's share of all counted tokens, as a percentage.
total_words = word_counts_df['Count'].sum()
word_counts_df['Weighted_Percentage'] = (word_counts_df['Count'] / total_words) * 100

top_20_words = word_counts_df.head(20)

# Bars show raw counts (left axis); the line shows percentage share (right axis).
fig, ax = plt.subplots(figsize=(12, 8))
top_20_words.plot.bar(x='Word', y='Count', ax=ax, colormap='viridis')
ax2 = ax.twinx()
top_20_words.plot.line(x='Word', y='Weighted_Percentage', ax=ax2, color='orange', marker='o')
ax.set_title('Top 20 Words with Counts and Weighted Percentages')
ax.set_ylabel('Count')
ax2.set_ylabel('Weighted Percentage', color='orange')
plt.show()


# Generating Word cloud for better visualization

In [None]:


# Concatenate every cleaned tweet into one corpus string for the cloud.
text_data = ' '.join(random_sample['Processed_Text'])

# Rendering options gathered in one place so they are easy to tweak.
wordcloud_options = dict(
    width=800,
    height=400,
    random_state=42,            # fixed layout seed
    background_color='black',
    colormap='inferno',
    contour_color='steelblue',  # outline colour
    contour_width=2,            # outline width
    max_words=200,              # upper bound on words drawn
)
wordcloud = WordCloud(**wordcloud_options).generate(text_data)

# Show the rendered cloud without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()
