# SL 3 MINI PROJECT - Sentiment Analysis

In [4]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.preprocessing import FunctionTransformer

In [5]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
# Load the dataset
df = pd.read_csv('train.csv')

In [7]:
# Drop rows with NaN values
df.dropna(inplace=True)

In [8]:
X = df['selected_text']
y = df['sentiment']

In [9]:
# Tokenization and Lemmatization function
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

In [10]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
# Define the pipelines for MultinomialNB and GaussianNB
pipeline_multinomial = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_lemmatize)),
    ('clf', MultinomialNB())
])

pipeline_gaussian = Pipeline([
    ('tfidf', TfidfVectorizer(tokenizer=tokenize_and_lemmatize)),
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
    ('clf', GaussianNB())
])


In [12]:
# Train the models
pipeline_multinomial.fit(X_train, y_train)
pipeline_gaussian.fit(X_train, y_train)


In [13]:
# Evaluate the models
accuracy_multinomial = pipeline_multinomial.score(X_test, y_test)
accuracy_gaussian = pipeline_gaussian.score(X_test, y_test)


In [14]:
print("Multinomial Naive Bayes Accuracy:", accuracy_multinomial)
print("Gaussian Naive Bayes Accuracy:", accuracy_gaussian)


Multinomial Naive Bayes Accuracy: 0.7800218340611353
Gaussian Naive Bayes Accuracy: 0.5844250363901019


In [15]:
# Make predictions
new_texts = ["This product is great", "I'm really disappointed with the service"]
predictions_multinomial = pipeline_multinomial.predict(new_texts)
predictions_gaussian = pipeline_gaussian.predict(new_texts)

In [16]:
print("Multinomial Naive Bayes Predictions:", predictions_multinomial)
print("Gaussian Naive Bayes Predictions:", predictions_gaussian)

Multinomial Naive Bayes Predictions: ['positive' 'negative']
Gaussian Naive Bayes Predictions: ['positive' 'negative']


In [None]:
!pip install pycaret mlflow



In [2]:
import pandas as pd
from pycaret.classification import *

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('wordnet')

# Load your dataset, replace 'your_dataset.csv' with your dataset file
df = pd.read_csv('train.csv')

# Drop rows with NaN values
df.dropna(inplace=True)

# Assuming your dataset has 'text' column for input and 'sentiment' column for labels
data = df[['selected_text', 'sentiment']]

# Initialize PyCaret setup for sentiment analysis
exp_clf = setup(data=data, target='sentiment', session_id=123, log_experiment=True, experiment_name='sentiment_analysis')

# Compare models
best_model = compare_models()

# Print best model
print(best_model)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Description,Value
0,Session id,123
1,Target,sentiment
2,Target type,Multiclass
3,Target mapping,"negative: 0, neutral: 1, positive: 2"
4,Original data shape,"(27480, 2)"
5,Transformed data shape,"(27480, 2)"
6,Transformed train set shape,"(19236, 2)"
7,Transformed test set shape,"(8244, 2)"
8,Categorical features,1
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.6021,0.0,0.6021,0.7947,0.5664,0.3542,0.4619,0.625
dt,Decision Tree Classifier,0.602,0.0,0.602,0.7944,0.5662,0.3541,0.4617,0.234
et,Extra Trees Classifier,0.602,0.0,0.602,0.7944,0.5662,0.3541,0.4617,0.452
nb,Naive Bayes,0.6019,0.0,0.6019,0.7948,0.5661,0.3539,0.4617,0.252
svm,SVM - Linear Kernel,0.6019,0.0,0.6019,0.7945,0.5661,0.3539,0.4616,0.247
rf,Random Forest Classifier,0.6019,0.0,0.6019,0.7944,0.5662,0.354,0.4617,0.521
qda,Quadratic Discriminant Analysis,0.6019,0.0,0.6019,0.7948,0.5661,0.3539,0.4617,0.136
ada,Ada Boost Classifier,0.6019,0.0,0.6019,0.7943,0.5662,0.354,0.4616,0.39
gbc,Gradient Boosting Classifier,0.6019,0.0,0.6019,0.7943,0.5662,0.354,0.4616,1.363
xgboost,Extreme Gradient Boosting,0.6019,0.0,0.6019,0.7943,0.5662,0.354,0.4616,0.476


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=5, p=2,
                     weights='uniform')
