In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sklearn.model_selection import train_test_split

# Read the raw dataset from disk
data = pd.read_csv('data.csv')

# Keep only the sentiment label ('class') and the tweet body, exposing the
# latter under the column name 'text'. Selecting the columns and then calling
# a non-inplace rename yields a fresh frame, so `data` itself is not modified.
new_data = data[['class', 'tweet']].rename(columns={'tweet': 'text'})

# Text-normalisation helper shared by the training pipeline and live predictions
def clean_text(text):
    """Normalise a raw tweet into lowercase alphabetic text.

    Steps, in order: lowercase; strip URLs, @mentions and #hashtags; drop
    every character that is not ``a-z`` or whitespace; replace each run of
    newlines with a single space.

    Args:
        text: The raw tweet. Any value that is not a ``str`` (for example the
            float NaN that pandas uses for a missing CSV cell, or ``None``)
            is treated as an empty tweet.

    Returns:
        The cleaned string, or ``''`` for non-string input.
    """
    # A missing cell has no .lower(); return empty text instead of letting a
    # single bad row abort the whole .apply() over the column.
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs (must run before punctuation is stripped)
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'#\w+', '', text)     # Remove hashtags
    text = re.sub(r'[^a-z\s]', '', text) # Remove non-alphabetic characters (whitespace is kept)
    text = re.sub(r'\n+', ' ', text)     # Collapse each run of newlines into one space
    return text

# Store a normalised version of every tweet next to the original text
new_data['clean_text'] = new_data['text'].map(clean_text)
print("Updated Data:")
print(new_data.head(10))

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable
train_data, test_data = train_test_split(
    new_data,
    random_state=42,
    test_size=0.33,
)
print(f'Training Data Size: {len(train_data)}')
print(f'Testing Data Size: {len(test_data)}')

In [None]:
# EDA: how many training tweets fall into each class?
print("\nClass Distribution in Training Set:")
print(train_data['class'].value_counts())

# Display names applied to the bars from left to right.
# NOTE(review): assumes the class ids sort in this same order - confirm against the dataset.
class_labels = ['Hate Speech', 'Offensive Language', 'Normal']

# Bar chart of the per-class counts, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='class', data=train_data, palette='viridis', ax=ax)
ax.set(
    title='Class Distribution in Training Set',
    xlabel='Sentiment',
    ylabel='Count',
)
# Replace the numeric class ids on the x-axis with the display names
ax.set_xticks(range(len(class_labels)))
ax.set_xticklabels(class_labels)

# Render the bar chart
plt.show()

# Bubble plot for class distribution in the training set
# Counts are sorted by class id so they line up with class_labels when zipped below.
class_counts = train_data['class'].value_counts().sort_index()
plt.figure(figsize=(10, 6))  # Set the figure size
colors = sns.color_palette('viridis', len(class_counts))  # One colour per class
# Marker area proportional to the class share; the largest class gets s=10000
bubble_sizes = class_counts / class_counts.max() * 10000

# Create the bubble plot
for i, (label, count) in enumerate(zip(class_labels, class_counts)):
    # bubble_sizes is indexed by class id, not by position, so use .iloc for
    # the i-th bubble; plain [i] would be a label lookup and only works when
    # the class ids happen to be exactly 0..n-1.
    plt.scatter(i * 0.5, 1, s=bubble_sizes.iloc[i], c=[colors[i]], alpha=0.6, edgecolors="w", linewidth=2)
    # Annotate each bubble with the class label and count
    plt.text(i * 0.5, 1, f'{label}\n{count}', ha='center', va='center', fontsize=12, color='black')

# Set the title and remove axis labels and ticks for a cleaner look
plt.title('Bubble Plot of Class Distribution in Training Set')
plt.xlabel('')
plt.ylabel('')
plt.xticks([])  # Remove x-axis ticks
plt.yticks([])  # Remove y-axis ticks
plt.grid(False)  # Remove the grid

# Bubbles sit at x = 0, 0.5, 1.0; pad the limits so the outer ones are not clipped
plt.xlim(-0.5, 1.5)

# Display the bubble plot
plt.show()

In [None]:
3. Bag-of-Words (BoW) Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder with default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from the training tweets only and encode them;
# the test tweets are then encoded with that same vocabulary.
X_train_bow = vectorizer.fit_transform(train_data['clean_text'])
X_test_bow = vectorizer.transform(test_data['clean_text'])

# Report the (rows, vocabulary size) of both matrices
train_shape, test_shape = X_train_bow.shape, X_test_bow.shape
print(f'Shape of X_train_bow: {train_shape}')
print(f'Shape of X_test_bow: {test_shape}')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Fit the baseline Logistic Regression on the bag-of-words features, then
# predict the held-out tweets
log_reg = LogisticRegression(max_iter=1000).fit(X_train_bow, train_data['class'])
y_pred_log_reg = log_reg.predict(X_test_bow)

# Overall accuracy, expressed as a percentage
accuracy_log_reg = 100 * accuracy_score(test_data['class'], y_pred_log_reg)
print(f'Accuracy (Logistic Regression): {accuracy_log_reg:.2f}%')

# Per-class precision / recall / F1 as a dict keyed by the display names
class_report_log_reg = classification_report(
    test_data['class'],
    y_pred_log_reg,
    target_names=class_labels,
    output_dict=True,
)
print('Classification Report (Logistic Regression, in %):')
# Only the per-class entries are printed; the aggregate rows of the report are skipped
for label in class_labels:
    metrics = class_report_log_reg[label]
    print(f'{label}:')
    print(f"  Precision: {metrics['precision'] * 100:.2f}%")
    print(f"  Recall: {metrics['recall'] * 100:.2f}%")
    print(f"  F1-score: {metrics['f1-score'] * 100:.2f}%")
    print(f"  Support: {metrics['support']}")

# Raw confusion matrix (rows: true class, columns: predicted class)
conf_matrix_log_reg = confusion_matrix(test_data['class'], y_pred_log_reg)

# Convert every row to percentages of that row's total
row_totals_log_reg = conf_matrix_log_reg.sum(axis=1, keepdims=True)
conf_matrix_normalized_log_reg = conf_matrix_log_reg.astype('float') / row_totals_log_reg * 100

# Heatmap of the row-normalised confusion matrix
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    conf_matrix_normalized_log_reg,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set(
    title='Confusion Matrix (Logistic Regression, in %)',
    xlabel='Predicted Label',
    ylabel='True Label',
)
plt.show()


In [None]:
from sklearn.svm import SVC

# Initialize the SVM classifier.
# max_iter=-1 (no limit) lets the solver run to convergence. The previous hard
# cap of 1000 iterations can stop libsvm early on a corpus of this size, which
# yields an under-trained model and an unfair accuracy comparison.
# Training therefore takes longer than with the cap.
svm_clf = SVC(kernel='linear', C=1, max_iter=-1)

# Train the model on the training data
svm_clf.fit(X_train_bow, train_data['class'])

# Make predictions on the test data
y_pred_svm = svm_clf.predict(X_test_bow)

# Calculate the accuracy of the SVM model and convert to percentage
accuracy_svm = accuracy_score(test_data['class'], y_pred_svm) * 100
print(f'Accuracy (SVM): {accuracy_svm:.2f}%')

# Build the classification report as a dict keyed by the display names
class_report_svm = classification_report(test_data['class'], y_pred_svm, target_names=class_labels, output_dict=True)
print('Classification Report (SVM, in %):')
# Print only the per-class entries; the aggregate rows of the report are skipped
for label, metrics in class_report_svm.items():
    if label in class_labels:
        print(f'{label}:')
        print(f"  Precision: {metrics['precision'] * 100:.2f}%")
        print(f"  Recall: {metrics['recall'] * 100:.2f}%")
        print(f"  F1-score: {metrics['f1-score'] * 100:.2f}%")
        print(f"  Support: {metrics['support']}")

# Compute the confusion matrix for the SVM model (rows: true class, columns: predicted class)
conf_matrix_svm = confusion_matrix(test_data['class'], y_pred_svm)

# Normalize each row to percentages of that row's total
conf_matrix_normalized_svm = conf_matrix_svm.astype('float') / conf_matrix_svm.sum(axis=1)[:, np.newaxis] * 100

# Plot the normalized confusion matrix for the SVM model
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_normalized_svm, annot=True, fmt='.2f', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix (SVM, in %)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Build and fit the Decision Tree in one step; the fixed seed keeps the tree reproducible
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train_bow, train_data['class'])

# Predict the held-out tweets
y_pred_dt = dt_clf.predict(X_test_bow)

# Overall accuracy, expressed as a percentage
accuracy_dt = 100 * accuracy_score(test_data['class'], y_pred_dt)
print(f'Accuracy (Decision Tree): {accuracy_dt:.2f}%')

# Per-class precision / recall / F1 as a dict keyed by the display names
class_report_dt = classification_report(
    test_data['class'],
    y_pred_dt,
    target_names=class_labels,
    output_dict=True,
)
print('Classification Report (Decision Tree, in %):')
# Only the per-class entries are printed; the aggregate rows of the report are skipped
for label in class_labels:
    metrics = class_report_dt[label]
    print(f'{label}:')
    print(f"  Precision: {metrics['precision'] * 100:.2f}%")
    print(f"  Recall: {metrics['recall'] * 100:.2f}%")
    print(f"  F1-score: {metrics['f1-score'] * 100:.2f}%")
    print(f"  Support: {metrics['support']}")

# Raw confusion matrix (rows: true class, columns: predicted class)
conf_matrix_dt = confusion_matrix(test_data['class'], y_pred_dt)

# Convert every row to percentages of that row's total
row_totals_dt = conf_matrix_dt.sum(axis=1, keepdims=True)
conf_matrix_normalized_dt = conf_matrix_dt.astype('float') / row_totals_dt * 100

# Heatmap of the row-normalised confusion matrix
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    conf_matrix_normalized_dt,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
heatmap_ax.set(
    title='Confusion Matrix (Decision Tree, in %)',
    xlabel='Predicted Label',
    ylabel='True Label',
)
plt.show()

In [None]:
# Headline accuracy of each of the three baseline models
print(f'Logistic Regression Accuracy: {accuracy_log_reg:.2f}%')
print(f'SVM Accuracy: {accuracy_svm:.2f}%')
print(f'Decision Tree Accuracy: {accuracy_dt:.2f}%')

# Collect the accuracies under their display names (insertion order = plot order)
accuracy_results = dict([
    ('Logistic Regression', accuracy_log_reg),
    ('Support Vector Machine (SVM)', accuracy_svm),
    ('Decision Tree', accuracy_dt),
])

# Text summary of the collected results
print('Accuracy Comparison:')
for model, accuracy in accuracy_results.items():
    print(f'{model}: {accuracy:.2f}%')

# Bar chart of the same numbers on a fixed 0-100 % scale
plt.figure(figsize=(10, 6))
plt.bar(
    list(accuracy_results),
    list(accuracy_results.values()),
    color=['blue', 'green', 'orange'],
)
plt.title('Comparison of Model Accuracies')
plt.ylabel('Accuracy (%)')
plt.ylim(0, 100)
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid for Logistic Regression
# 'C' is the regularization parameter. Lower values specify stronger regularization.
# 'solver' is the algorithm to use in the optimization problem.
# 'max_iter' is deliberately NOT searched: it is only an iteration budget, so once
# the solver has converged a larger value yields the same model. Searching four
# values of it would quadruple the number of fits without changing the outcome.
# NOTE(review): recent scikit-learn releases may warn about 'liblinear' on a
# multiclass target - confirm against the installed version.
param_grid_log_reg = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
}

# Initialize the Logistic Regression model with a generous, fixed iteration budget
# (the largest value the previous grid tried) so every candidate can converge
log_reg = LogisticRegression(max_iter=5000)

# Initialize GridSearchCV with cross-validation
# GridSearchCV exhaustively evaluates every combination in the parameter grid.
# cv=5 means 5-fold cross-validation, scoring='accuracy' ranks candidates by accuracy.
# verbose=2 gives detailed logs of the search process, n_jobs=-1 uses all available cores.
grid_search_log_reg = GridSearchCV(estimator=log_reg, param_grid=param_grid_log_reg, cv=5, scoring='accuracy', verbose=2, n_jobs=-1)

# Fit the grid search to the training data
# This runs the search and, because refit is left at its default, finishes by
# refitting the best candidate on the whole training set.
grid_search_log_reg.fit(X_train_bow, train_data['class'])

# Print the best parameters and the best score found by GridSearchCV
print(f'Best Parameters: {grid_search_log_reg.best_params_}')
print(f'Best Cross-Validation Score: {grid_search_log_reg.best_score_}')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Take the tuned Logistic Regression model from the grid search.
# GridSearchCV was created without a refit argument, so it has already refitted
# best_estimator_ on the full training set; fitting it again here would only
# repeat that work.
best_log_reg = grid_search_log_reg.best_estimator_
y_pred_updated = best_log_reg.predict(X_test_bow)

# Calculate the accuracy of the updated Logistic Regression model and convert to percentage
accuracy_updated = accuracy_score(test_data['class'], y_pred_updated) * 100
print(f'Accuracy (Updated Logistic Regression): {accuracy_updated:.2f}%')

# Build the classification report as a dict keyed by the display names
class_report_updated = classification_report(test_data['class'], y_pred_updated, target_names=class_labels, output_dict=True)
print('Classification Report (Updated Logistic Regression, in %):')
# Print only the per-class entries; the aggregate rows of the report are skipped
for label, metrics in class_report_updated.items():
    if label in class_labels:
        print(f'{label}:')
        print(f"  Precision: {metrics['precision'] * 100:.2f}%")
        print(f"  Recall: {metrics['recall'] * 100:.2f}%")
        print(f"  F1-score: {metrics['f1-score'] * 100:.2f}%")
        print(f"  Support: {metrics['support']}")

# Compute the confusion matrix for the updated model (rows: true class, columns: predicted class)
conf_matrix_updated = confusion_matrix(test_data['class'], y_pred_updated)

# Normalize each row to percentages of that row's total
conf_matrix_normalized_updated = conf_matrix_updated.astype('float') / conf_matrix_updated.sum(axis=1)[:, np.newaxis] * 100

# Plot the normalized confusion matrix for the updated Logistic Regression model
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix_normalized_updated, annot=True, fmt='.2f', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix (Updated Logistic Regression, in %)')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()


In [None]:
# Accuracy (in %) of the baseline and the grid-search-tuned Logistic Regression,
# computed from the predictions each model made on the test set
accuracy_log_reg_original = accuracy_score(test_data['class'], y_pred_log_reg) * 100
accuracy_log_reg_updated = accuracy_score(test_data['class'], y_pred_updated) * 100

# Store the results in a dictionary (insertion order = plot order).
# The second label previously read '(with t)', which looks like a truncated
# string; it now names the tuned model explicitly.
accuracy_results = {
    'Logistic Regression (Original)': accuracy_log_reg_original,
    'Logistic Regression (Tuned)': accuracy_log_reg_updated,
    'Support Vector Machine (SVM)': accuracy_svm,
    'Decision Tree': accuracy_dt
}

# Print the accuracy results
print('Accuracy Comparison:')
for model, accuracy in accuracy_results.items():
    print(f'{model}: {accuracy:.2f}%')

# Plot the accuracies for comparison on a fixed 0-100 % scale
plt.figure(figsize=(10, 6))
plt.bar(accuracy_results.keys(), accuracy_results.values(), color=['blue', 'green', 'orange', 'red'])
plt.title('Comparison of Model Accuracies')
plt.ylabel('Accuracy (%)')
plt.ylim(0, 100)
plt.xticks(rotation=45)  # Long model names overlap when drawn horizontally
plt.show()

In [None]:
import re
from sklearn.feature_extraction.text import CountVectorizer

# Display names looked up by the predicted class id
class_labels = ['Hate Speech', 'Offensive Language', 'Normal']


def predict_sentiment(input_text):
    """Return the display name of the class predicted for one raw tweet.

    The text is passed through ``clean_text``, encoded with the fitted
    ``vectorizer`` and classified by ``best_log_reg``; the predicted class id
    is then used as an index into ``class_labels``.

    Args:
        input_text: The raw tweet text.

    Returns:
        The matching entry of ``class_labels``.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list
    bow_features = vectorizer.transform([clean_text(input_text)])

    # predict() returns one class id per document; there is exactly one here
    class_id = best_log_reg.predict(bow_features)[0]
    return class_labels[class_id]


# Interactive demo: read one tweet from the user and show the predicted label
input_text = input("Enter a tweet to predict its sentiment: ")
predicted_sentiment = predict_sentiment(input_text)
print(f'Predicted Sentiment: {predicted_sentiment}')

In [None]:
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# Persist the tuned model together with the vectorizer it was trained with.
# This relies on `best_log_reg` and `vectorizer` from the earlier cells.
# Previously this cell retrained a default LogisticRegression and wrote that
# to 'best_log_reg.pkl', so the file did not contain the tuned model that
# predict_sentiment uses. The vectorizer was also refitted on the same
# training text, which only reproduces the vocabulary it already has.
joblib.dump(best_log_reg, 'best_log_reg.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')