--------------------------------  porter stemmer with pos + stopwords + tokenize _ tf-idf vectorizer -------------------------------

In [3]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_review(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use PorterStemmer for stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform POS tagging
    pos_tags = pos_tag(tokens)
    tokens = [word + '_' + pos for word, pos in pos_tags]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_review)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_review)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = tfidf_vectorizer.transform(test_sentiments['processed_reviews'] )

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print(nb_predictions)
print("/n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print(rf_predictions)

Naive Bayes Accuracy: 0.8469
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

    negative       0.85      0.84      0.85      9935
    positive       0.85      0.85      0.85     10065

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000

['positive' 'positive' 'negative' ... 'negative' 'positive' 'negative']
/n
Random Forest Accuracy: 0.83375
Classification Report for Random Forest:
               precision    recall  f1-score   support

    negative       0.83      0.84      0.83      9935
    positive       0.84      0.82      0.83     10065

    accuracy                           0.83     20000
   macro avg       0.83      0.83      0.83     20000
weighted avg       0.83      0.83      0.83     20000

['positive' 'positive' 'negative' ... 'negative' 'positive' 'negative']


In [5]:
# k-NN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("/n")

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("/n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))

k-NN Accuracy: 0.66965
Classification Report for k-NN:
               precision    recall  f1-score   support

    negative       0.65      0.74      0.69      9935
    positive       0.70      0.60      0.64     10065

    accuracy                           0.67     20000
   macro avg       0.67      0.67      0.67     20000
weighted avg       0.67      0.67      0.67     20000

/n
Decision Tree Accuracy: 0.69575
Classification Report for Decision Tree:
               precision    recall  f1-score   support

    negative       0.69      0.70      0.70      9935
    positive       0.70      0.69      0.70     10065

    accuracy                           0.70     20000
   macro avg       0.70      0.70      0.70     20000
weighted avg       0.70      0.70      0.70     20000

/n
Gradient Boosting Accuracy: 0.79635
Classification Report for Gradient Boosting:
               precision    recall  f1-score   support

    negative       0.83      0.75      0.78      9935
    positive       

In [6]:

# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("/n")

Stacking Accuracy: 0.8613
Classification Report for Stacking:
               precision    recall  f1-score   support

    negative       0.86      0.86      0.86      9935
    positive       0.86      0.87      0.86     10065

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000

/n




KeyboardInterrupt: 

In [7]:

# Neural Network
nn_model = MLPClassifier(hidden_layer_sizes=(64,), max_iter=100, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))


Neural Network Accuracy: 0.8495
Classification Report for Neural Network:
               precision    recall  f1-score   support

    negative       0.85      0.85      0.85      9935
    positive       0.85      0.85      0.85     10065

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000



In [None]:
# Bagging
bagging_model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("/n")

Bagging Accuracy: 0.7925
Classification Report for Bagging:
               precision    recall  f1-score   support

    negative       0.79      0.79      0.79      9935
    positive       0.79      0.79      0.79     10065

    accuracy                           0.79     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.79      0.79      0.79     20000

/n


------------------------  porter stemmer with pos with count vectorizer -----------------------------

In [1]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_review(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use PorterStemmer for stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform POS tagging
    pos_tags = pos_tag(tokens)
    tokens = [word + '_' + pos for word, pos in pos_tags]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_review)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_review)

# Vectorize the text data using CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)
train_tfidf = count_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = count_vectorizer.transform(test_sentiments['processed_reviews'])

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print(nb_predictions)
print("/n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print(rf_predictions)

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("/n")

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("/n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))


# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("/n")

# Bagging
bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("/n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))


# Neural Network
nn_model = MLPClassifier(hidden_layer_sizes=(64,), max_iter=100, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))


Naive Bayes Accuracy: 0.83615
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

    negative       0.83      0.84      0.84      9935
    positive       0.84      0.83      0.84     10065

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000

['positive' 'positive' 'negative' ... 'negative' 'positive' 'negative']
/n
Random Forest Accuracy: 0.8348
Classification Report for Random Forest:
               precision    recall  f1-score   support

    negative       0.83      0.84      0.83      9935
    positive       0.84      0.83      0.83     10065

    accuracy                           0.83     20000
   macro avg       0.83      0.83      0.83     20000
weighted avg       0.83      0.83      0.83     20000

['positive' 'positive' 'negative' ... 'negative' 'positive' 'negative']
k-NN Accuracy: 0.58895
Classification Report for k-NN:
 



Bagging Accuracy: 0.7902
Classification Report for Bagging:
               precision    recall  f1-score   support

    negative       0.79      0.78      0.79      9935
    positive       0.79      0.80      0.79     10065

    accuracy                           0.79     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.79      0.79      0.79     20000

/n
Voting Classifier Accuracy: 0.82985
Classification Report for Voting Classifier:
               precision    recall  f1-score   support

    negative       0.83      0.82      0.83      9935
    positive       0.83      0.84      0.83     10065

    accuracy                           0.83     20000
   macro avg       0.83      0.83      0.83     20000
weighted avg       0.83      0.83      0.83     20000

Neural Network Accuracy: 0.85705
Classification Report for Neural Network:
               precision    recall  f1-score   support

    negative       0.86      0.85      0.86      9935
    positive   

----------------- PORTER STEMMER WITHOUT POS _ COUNT VECTORIZER WITH ML MODELS

------------------ LANCASTER STEMMER WITH POS _ TF-IDF VECTORIZATION WITH ML MODELS -------------------------------

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer 
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_review(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use LancasterStemmer for stemming
    stemmer = LancasterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform POS tagging
    pos_tags = pos_tag(tokens)
    tokens = [word + '_' + pos for word, pos in pos_tags]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_review)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_review)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = tfidf_vectorizer.transform(test_sentiments['processed_reviews'] )

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print(nb_predictions)
print("/n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print(rf_predictions)

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("/n")

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("/n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))


# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("/n")

# Bagging
bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("/n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))


# Neural Network
nn_model = MLPClassifier(hidden_layer_sizes=(64,), max_iter=100, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))


Naive Bayes Accuracy: 0.845
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

    negative       0.84      0.84      0.84      9935
    positive       0.85      0.85      0.85     10065

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000

['positive' 'positive' 'negative' ... 'negative' 'positive' 'negative']
/n
Random Forest Accuracy: 0.83445
Classification Report for Random Forest:
               precision    recall  f1-score   support

    negative       0.83      0.84      0.83      9935
    positive       0.84      0.83      0.83     10065

    accuracy                           0.83     20000
   macro avg       0.83      0.83      0.83     20000
weighted avg       0.83      0.83      0.83     20000

['negative' 'positive' 'negative' ... 'negative' 'positive' 'negative']
k-NN Accuracy: 0.67225
Classification Report for k-NN:
  



Bagging Accuracy: 0.79065
Classification Report for Bagging:
               precision    recall  f1-score   support

    negative       0.79      0.78      0.79      9935
    positive       0.79      0.80      0.79     10065

    accuracy                           0.79     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.79      0.79      0.79     20000

/n
Voting Classifier Accuracy: 0.8408
Classification Report for Voting Classifier:
               precision    recall  f1-score   support

    negative       0.84      0.84      0.84      9935
    positive       0.84      0.84      0.84     10065

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000

Neural Network Accuracy: 0.84925
Classification Report for Neural Network:
               precision    recall  f1-score   support

    negative       0.85      0.84      0.85      9935
    positive   

---------------------------------- LANCASTER STEMMER WITHOUT POS _ COUNT VECTORIZER ---------------------------------

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer 
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_review(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use LancasterStemmer for stemming
    stemmer = LancasterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_review)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_review)

# Vectorize the text data using CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)
train_tfidf = count_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = count_vectorizer.transform(test_sentiments['processed_reviews'])


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print(nb_predictions)
print("/n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print(rf_predictions)


# Neural Network
nn_model = MLPClassifier(hidden_layer_sizes=(64,), max_iter=100, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("/n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("/n")

# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("/n")

# Bagging
bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("/n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))



Naive Bayes Accuracy: 0.8427
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

    negative       0.83      0.85      0.84      9935
    positive       0.85      0.83      0.84     10065

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000

['positive' 'positive' 'negative' ... 'negative' 'positive' 'negative']
/n
Random Forest Accuracy: 0.8427
Classification Report for Random Forest:
               precision    recall  f1-score   support

    negative       0.84      0.85      0.84      9935
    positive       0.85      0.84      0.84     10065

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000

['negative' 'positive' 'negative' ... 'negative' 'positive' 'negative']
Neural Network Accuracy: 0.86675
Classification Report fo



Bagging Accuracy: 0.80275
Classification Report for Bagging:
               precision    recall  f1-score   support

    negative       0.81      0.79      0.80      9935
    positive       0.80      0.82      0.81     10065

    accuracy                           0.80     20000
   macro avg       0.80      0.80      0.80     20000
weighted avg       0.80      0.80      0.80     20000

/n
Voting Classifier Accuracy: 0.84365
Classification Report for Voting Classifier:
               precision    recall  f1-score   support

    negative       0.84      0.84      0.84      9935
    positive       0.84      0.85      0.85     10065

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000



---------------------------------- LANCASTER STEMMER WITHOUT POS _ COUNT VECTORIZER with modified parameters of ml models ---------------------------------


In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer 
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_review(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use LancasterStemmer for stemming
    stemmer = LancasterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_review)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_review)

# Vectorize the text data using CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)
train_tfidf = count_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = count_vectorizer.transform(test_sentiments['processed_reviews'])


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier

# Naive Bayes
nb_model = MultinomialNB(alpha=0.1) 
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print("\n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42) 
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print("\n")

# Neural Network
nn_model =  MLPClassifier(hidden_layer_sizes=(128,), max_iter=200, alpha=0.0001, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network MLP CLASSIFIER Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))
print("\n")

# Decision Tree
dt_model =  DecisionTreeClassifier(max_depth=10, min_samples_split=10, random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("\n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))
print("\n")

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("\n")

# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("\n")

# Bagging
bagging_model = BaggingClassifier(estimator= ExtraTreesClassifier(), n_estimators=200, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("\n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model), ('st', stacking_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))

2024-02-11 06:50:06.656994: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-11 06:50:06.657186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-11 06:50:06.854562: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Naive Bayes Accuracy: 0.84295
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

    negative       0.84      0.85      0.84      9935
    positive       0.85      0.83      0.84     10065

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000



Random Forest Accuracy: 0.8277
Classification Report for Random Forest:
               precision    recall  f1-score   support

    negative       0.86      0.78      0.82      9935
    positive       0.80      0.87      0.84     10065

    accuracy                           0.83     20000
   macro avg       0.83      0.83      0.83     20000
weighted avg       0.83      0.83      0.83     20000



Neural Network MLP CLASSIFIER Accuracy: 0.8717
Classification Report for Neural Network:
               precision    recall  f1-score   support

    negative       0.88      0.86      0.87      9935


LANCASTER STEMMER WITH POS WITH COUNT VECTORIZER

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer 
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_review(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use LancasterStemmer for stemming
    stemmer = LancasterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform POS tagging
    pos_tags = pos_tag(tokens)
    tokens = [word + '_' + pos for word, pos in pos_tags]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_review)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_review)

# Vectorize the text data using CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)
train_tfidf = count_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = count_vectorizer.transform(test_sentiments['processed_reviews'])


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [5]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print(nb_predictions)
print("/n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print(rf_predictions)


# Neural Network
nn_model = MLPClassifier(hidden_layer_sizes=(64,), max_iter=100, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("/n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("/n")

# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("/n")

# Bagging
bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("/n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))



Naive Bayes Accuracy: 0.8342
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

    negative       0.83      0.84      0.83      9935
    positive       0.84      0.83      0.83     10065

    accuracy                           0.83     20000
   macro avg       0.83      0.83      0.83     20000
weighted avg       0.83      0.83      0.83     20000

['positive' 'positive' 'negative' ... 'negative' 'positive' 'negative']
/n
Random Forest Accuracy: 0.8292
Classification Report for Random Forest:
               precision    recall  f1-score   support

    negative       0.83      0.83      0.83      9935
    positive       0.83      0.83      0.83     10065

    accuracy                           0.83     20000
   macro avg       0.83      0.83      0.83     20000
weighted avg       0.83      0.83      0.83     20000

['negative' 'positive' 'negative' ... 'negative' 'positive' 'negative']
Neural Network Accuracy: 0.85775
Classification Report fo



Bagging Accuracy: 0.79035
Classification Report for Bagging:
               precision    recall  f1-score   support

    negative       0.79      0.78      0.79      9935
    positive       0.79      0.80      0.79     10065

    accuracy                           0.79     20000
   macro avg       0.79      0.79      0.79     20000
weighted avg       0.79      0.79      0.79     20000

/n
Voting Classifier Accuracy: 0.82985
Classification Report for Voting Classifier:
               precision    recall  f1-score   support

    negative       0.83      0.82      0.83      9935
    positive       0.83      0.84      0.83     10065

    accuracy                           0.83     20000
   macro avg       0.83      0.83      0.83     20000
weighted avg       0.83      0.83      0.83     20000



------------------ SNOWBALL STEMMER WITHOUT POS - TF-IDF VECTORIZATION WITH ML MODELS -------------------------------

In [7]:
from nltk.stem import SnowballStemmer 
train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_sreview(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use SnowballStemmer for stemming
    stemmer = SnowballStemmer("english")
    tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_sreview)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_sreview)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = tfidf_vectorizer.transform(test_sentiments['processed_reviews'] )

In [None]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print(nb_predictions)
print("/n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print(rf_predictions)

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("/n")

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("/n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))



Naive Bayes Accuracy: 0.85145
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

    negative       0.85      0.85      0.85      9935
    positive       0.85      0.85      0.85     10065

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000

['positive' 'positive' 'negative' ... 'negative' 'positive' 'negative']
/n
Random Forest Accuracy: 0.8484
Classification Report for Random Forest:
               precision    recall  f1-score   support

    negative       0.84      0.86      0.85      9935
    positive       0.86      0.84      0.85     10065

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000

['negative' 'positive' 'negative' ... 'negative' 'positive' 'negative']
k-NN Accuracy: 0.74325
Classification Report for k-NN:
 

In [1]:
from nltk.stem import SnowballStemmer 
import nltk
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer 
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_sreview(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use SnowballStemmer for stemming
    stemmer = SnowballStemmer("english")
    tokens = [stemmer.stem(token) for token in tokens]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_sreview)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_sreview)
# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = tfidf_vectorizer.transform(test_sentiments['processed_reviews'] )

# Neural Network
nn_model = MLPClassifier(hidden_layer_sizes=(64,), max_iter=100, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))


dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(train_tfidf, train_sentiments['sentiment'])

# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("/n")

# Bagging
bagging_model = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("/n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))



[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Neural Network Accuracy: 0.8579
Classification Report for Neural Network:
               precision    recall  f1-score   support

    negative       0.86      0.85      0.86      9935
    positive       0.86      0.86      0.86     10065

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000

Stacking Accuracy: 0.87435
Classification Report for Stacking:
               precision    recall  f1-score   support

    negative       0.87      0.87      0.87   



Bagging Accuracy: 0.8092
Classification Report for Bagging:
               precision    recall  f1-score   support

    negative       0.81      0.81      0.81      9935
    positive       0.81      0.81      0.81     10065

    accuracy                           0.81     20000
   macro avg       0.81      0.81      0.81     20000
weighted avg       0.81      0.81      0.81     20000

/n
Voting Classifier Accuracy: 0.8566
Classification Report for Voting Classifier:
               precision    recall  f1-score   support

    negative       0.87      0.84      0.85      9935
    positive       0.85      0.87      0.86     10065

    accuracy                           0.86     20000
   macro avg       0.86      0.86      0.86     20000
weighted avg       0.86      0.86      0.86     20000



---------------------------------------- SPACY LEMMATIZER without POS _ TF-IDF VECTORIZER WITH ML MODELS --------------------------

In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m54.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import pandas as pd
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the English language model in SpaCy
nlp = spacy.load("en_core_web_sm")

# Function for lemmatization
def preprocess_lemreview(review):
    # Tokenize the review
    tokens = word_tokenize(review)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatize using SpaCy
    doc = nlp(" ".join(tokens))
    tokens = [token.lemma_ for token in doc]

    return ' '.join(tokens)

# Read the CSV files
train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Apply preprocessing to the reviews
train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_lemreview)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_lemreview)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = tfidf_vectorizer.transform(test_sentiments['processed_reviews'])

In [3]:
from keras.models import Sequential
from keras.layers import Dense
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print("\n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print("\n")

# Neural Network
nn_model = MLPClassifier(hidden_layer_sizes=(64,), max_iter=100, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network MLP CLASSIFIER Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))
print("\n")

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("\n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))
print("\n")

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("\n")

# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("\n")

# Bagging
bagging_model = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=100, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("\n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))

2024-02-08 10:05:25.838282: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-08 10:05:25.838569: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-08 10:05:25.987635: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Naive Bayes Accuracy: 0.85375
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

    negative       0.85      0.85      0.85      9935
    positive       0.85      0.85      0.85     10065

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000



Random Forest Accuracy: 0.8441
Classification Report for Random Forest:
               precision    recall  f1-score   support

    negative       0.83      0.86      0.85      9935
    positive       0.85      0.83      0.84     10065

    accuracy                           0.84     20000
   macro avg       0.84      0.84      0.84     20000
weighted avg       0.84      0.84      0.84     20000



Neural Network MLP CLASSIFIER Accuracy: 0.858
Classification Report for Neural Network:
               precision    recall  f1-score   support

    negative       0.86      0.86      0.86      9935
 

spacy lemmatizer without pos _ tf -idf vectorizer with modified parameters of ML modela

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
import pandas as pd
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the English language model in SpaCy
nlp = spacy.load("en_core_web_sm")

# Function for lemmatization
def preprocess_lemreview(review):
    # Tokenize the review
    tokens = word_tokenize(review)
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Lemmatize using SpaCy
    doc = nlp(" ".join(tokens))
    tokens = [token.lemma_ for token in doc]

    return ' '.join(tokens)

# Read the CSV files
train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Apply preprocessing to the reviews
train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_lemreview)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_lemreview)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = tfidf_vectorizer.transform(test_sentiments['processed_reviews'])

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier

# Naive Bayes
nb_model = MultinomialNB(alpha=0.1) 
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print("\n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42) 
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print("\n")

# Neural Network
nn_model =  MLPClassifier(hidden_layer_sizes=(128,), max_iter=200, alpha=0.0001, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network MLP CLASSIFIER Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))
print("\n")

# Decision Tree
dt_model =  DecisionTreeClassifier(max_depth=10, min_samples_split=10, random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("\n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))
print("\n")

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("\n")

# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("\n")

# Bagging
bagging_model = BaggingClassifier(estimator= ExtraTreesClassifier(), n_estimators=200, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("\n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model), ('st', stacking_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))

Naive Bayes Accuracy: 0.8535
Classification Report for Naive Bayes:
               precision    recall  f1-score   support

    negative       0.85      0.85      0.85      9935
    positive       0.85      0.86      0.85     10065

    accuracy                           0.85     20000
   macro avg       0.85      0.85      0.85     20000
weighted avg       0.85      0.85      0.85     20000



Random Forest Accuracy: 0.82955
Classification Report for Random Forest:
               precision    recall  f1-score   support

    negative       0.85      0.80      0.82      9935
    positive       0.81      0.86      0.84     10065

    accuracy                           0.83     20000
   macro avg       0.83      0.83      0.83     20000
weighted avg       0.83      0.83      0.83     20000



Neural Network MLP CLASSIFIER Accuracy: 0.8603
Classification Report for Neural Network:
               precision    recall  f1-score   support

    negative       0.86      0.86      0.86      9935


porter stemmer with POS, tf-idf vectorizer, normalization (Min Max), improved parameters for ML model

In [None]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier, BaggingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_review(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use PorterStemmer for stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform POS tagging
    pos_tags = pos_tag(tokens)
    tokens = [word + '_' + pos for word, pos in pos_tags]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_review)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_review)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = tfidf_vectorizer.transform(test_sentiments['processed_reviews'] )

# Perform Min-Max scaling for normalization
scaler = MinMaxScaler()
train_tfidf = scaler.fit_transform(train_tfidf.toarray())
test_tfidf = scaler.transform(test_tfidf.toarray())

# You can use train_tfidf_normalized and test_tfidf_normalized for further processing


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier

# Naive Bayes
nb_model = MultinomialNB(alpha=0.1) 
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print("\n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42) 
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print("\n")

# Neural Network
nn_model =  MLPClassifier(hidden_layer_sizes=(128,), max_iter=200, alpha=0.0001, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network MLP CLASSIFIER Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))
print("\n")

# Decision Tree
dt_model =  DecisionTreeClassifier(max_depth=10, min_samples_split=10, random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("\n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))
print("\n")

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("\n")

# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("\n")

# Bagging
bagging_model = BaggingClassifier(estimator= ExtraTreesClassifier(), n_estimators=200, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("\n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model), ('st', stacking_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))

porter stemmer with POS, tf-idf vectorizer, normalization (Standard), improved parameters for ML model

In [None]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_review(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use PorterStemmer for stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform POS tagging
    pos_tags = pos_tag(tokens)
    tokens = [word + '_' + pos for word, pos in pos_tags]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_review)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_review)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = tfidf_vectorizer.transform(test_sentiments['processed_reviews'] )

# Perform standardization for normalization
scaler = StandardScaler()
train_tfidf= scaler.fit_transform(train_tfidf.toarray())
test_tfidf = scaler.transform(test_tfidf.toarray())

# You can use train_tfidf_standardized and test_tfidf_standardized for further processing


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier

# Naive Bayes
nb_model = MultinomialNB(alpha=0.1) 
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print("\n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42) 
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print("\n")

# Neural Network
nn_model =  MLPClassifier(hidden_layer_sizes=(128,), max_iter=200, alpha=0.0001, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network MLP CLASSIFIER Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))
print("\n")

# Decision Tree
dt_model =  DecisionTreeClassifier(max_depth=10, min_samples_split=10, random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("\n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))
print("\n")

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("\n")

# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("\n")

# Bagging
bagging_model = BaggingClassifier(estimator= ExtraTreesClassifier(), n_estimators=200, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("\n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model), ('st', stacking_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))

porter stemmer with POS, tf-idf vectorizer, normalization (robust), improved parameters for ML model

In [None]:
import nltk
from nltk.corpus import stopwords
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import RobustScaler

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

train_sentiments = pd.read_csv("/kaggle/input/sent-analysis/train/train.csv")
test_sentiments = pd.read_csv("/kaggle/input/sent-analysis/test/test.csv")

# Text preprocessing function with stemming and POS tagging
def preprocess_review(review):
    tokens = word_tokenize(review)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token.lower() not in stop_words]

    # Use PorterStemmer for stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    # Perform POS tagging
    pos_tags = pos_tag(tokens)
    tokens = [word + '_' + pos for word, pos in pos_tags]

    return ' '.join(tokens)

train_sentiments['processed_reviews'] = train_sentiments['review'].apply(preprocess_review)
test_sentiments['processed_reviews'] = test_sentiments['review'].apply(preprocess_review)

# Vectorize the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = tfidf_vectorizer.fit_transform(train_sentiments['processed_reviews'])
test_tfidf = tfidf_vectorizer.transform(test_sentiments['processed_reviews'] )

# Perform robust scaling for normalization
scaler = RobustScaler()
train_tfidf = scaler.fit_transform(train_tfidf.toarray())
test_tfidf = scaler.transform(test_tfidf.toarray())

# You can use train_tfidf_scaled and test_tfidf_scaled for further processing


In [None]:
from keras.models import Sequential
from keras.layers import Dense
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier

# Naive Bayes
nb_model = MultinomialNB(alpha=0.1) 
nb_model.fit(train_tfidf, train_sentiments['sentiment'])
nb_predictions = nb_model.predict(test_tfidf)
nb_accuracy = accuracy_score(test_sentiments['sentiment'], nb_predictions)
print("Naive Bayes Accuracy:", nb_accuracy)
print("Classification Report for Naive Bayes:\n", classification_report(test_sentiments['sentiment'], nb_predictions))
print("\n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42) 
rf_model.fit(train_tfidf, train_sentiments['sentiment'])
rf_predictions = rf_model.predict(test_tfidf)
rf_accuracy = accuracy_score(test_sentiments['sentiment'], rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report for Random Forest:\n", classification_report(test_sentiments['sentiment'], rf_predictions))
print("\n")

# Neural Network
nn_model =  MLPClassifier(hidden_layer_sizes=(128,), max_iter=200, alpha=0.0001, random_state=42)
nn_model.fit(train_tfidf, train_sentiments['sentiment'])
nn_predictions = nn_model.predict(test_tfidf)
nn_accuracy = accuracy_score(test_sentiments['sentiment'], nn_predictions)
print("Neural Network MLP CLASSIFIER Accuracy:", nn_accuracy)
print("Classification Report for Neural Network:\n", classification_report(test_sentiments['sentiment'], nn_predictions))
print("\n")

# Decision Tree
dt_model =  DecisionTreeClassifier(max_depth=10, min_samples_split=10, random_state=42)
dt_model.fit(train_tfidf, train_sentiments['sentiment'])
dt_predictions = dt_model.predict(test_tfidf)
dt_accuracy = accuracy_score(test_sentiments['sentiment'], dt_predictions)
print("Decision Tree Accuracy:", dt_accuracy)
print("Classification Report for Decision Tree:\n", classification_report(test_sentiments['sentiment'], dt_predictions))
print("\n")

# Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, random_state=42)
gb_model.fit(train_tfidf, train_sentiments['sentiment'])
gb_predictions = gb_model.predict(test_tfidf)
gb_accuracy = accuracy_score(test_sentiments['sentiment'], gb_predictions)
print("Gradient Boosting Accuracy:", gb_accuracy)
print("Classification Report for Gradient Boosting:\n", classification_report(test_sentiments['sentiment'], gb_predictions))
print("\n")

# k-NN
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(train_tfidf, train_sentiments['sentiment'])
knn_predictions = knn_model.predict(test_tfidf)
knn_accuracy = accuracy_score(test_sentiments['sentiment'], knn_predictions)
print("k-NN Accuracy:", knn_accuracy)
print("Classification Report for k-NN:\n", classification_report(test_sentiments['sentiment'], knn_predictions))
print("\n")

# Stacking
stacking_model = StackingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model),('gb', gb_model)],
    final_estimator=GradientBoostingClassifier(n_estimators=100, random_state=42)
)
stacking_model.fit(train_tfidf, train_sentiments['sentiment'])
stacking_predictions = stacking_model.predict(test_tfidf)
stacking_accuracy = accuracy_score(test_sentiments['sentiment'], stacking_predictions)
print("Stacking Accuracy:", stacking_accuracy)
print("Classification Report for Stacking:\n", classification_report(test_sentiments['sentiment'], stacking_predictions))
print("\n")

# Bagging
bagging_model = BaggingClassifier(estimator= ExtraTreesClassifier(), n_estimators=200, random_state=42)
bagging_model.fit(train_tfidf, train_sentiments['sentiment'])
bagging_predictions = bagging_model.predict(test_tfidf)
bagging_accuracy = accuracy_score(test_sentiments['sentiment'], bagging_predictions)
print("Bagging Accuracy:", bagging_accuracy)
print("Classification Report for Bagging:\n", classification_report(test_sentiments['sentiment'], bagging_predictions))
print("\n")

# Voting Classifier
voting_model = VotingClassifier(
    estimators=[('nb', nb_model), ('rf', rf_model), ('knn', knn_model), ('dt', dt_model), ('gb', gb_model), ('st', stacking_model)],
    voting='hard'
)
voting_model.fit(train_tfidf, train_sentiments['sentiment'])
voting_predictions = voting_model.predict(test_tfidf)
voting_accuracy = accuracy_score(test_sentiments['sentiment'], voting_predictions)
print("Voting Classifier Accuracy:", voting_accuracy)
print("Classification Report for Voting Classifier:\n", classification_report(test_sentiments['sentiment'], voting_predictions))