In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline

In [3]:
# Load the raw CSV (expects "text" and "label" columns, as the preview shows)
# NOTE(review): latin1 accepts any byte sequence, so a wrong encoding would not
# raise here -- confirm the file really is latin1-encoded.
data = pd.read_csv("twitter.csv", encoding="latin1")
df = pd.DataFrame(data)
print("Dataset preview:", df.head(5), sep="\n")

# Count the rows that have no tweet text
missing_text = df['text'].isna()
null_count = missing_text.sum()
print(f"\nNumber of null values in text column: {null_count}")

# Keep only rows with text present; the label column is not checked here
df = df.dropna(subset=['text'])
print(f"Dataset shape after removing null values: {df.shape}")


Dataset preview:
                                                text     label
0  im getting on borderlands and i will murder yo...  Positive
1  I am coming to the borders and I will kill you...  Positive
2  im getting on borderlands and i will kill you ...  Positive
3  im coming on borderlands and i will murder you...  Positive
4  im getting on borderlands 2 and i will murder ...  Positive

Number of null values in text column: 686
Dataset shape after removing null values: (73996, 2)


In [4]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
# NOTE(review): the dataset preview shows several near-identical variants of one
# tweet. A row-level random split can put such variants on both sides, which may
# inflate test scores -- consider de-duplicating or splitting by source tweet.
text_column = df['text']
label_column = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    text_column, label_column, test_size=0.2, random_state=42
)
print(f"\nTraining data size: {len(X_train)}")
print(f"Testing data size: {len(X_test)}")



Training data size: 59196
Testing data size: 14800


In [5]:
# 1. Basic Model with CountVectorizer
# Baseline: default bag-of-words counts fed into a default multinomial naive Bayes.
print("\n--- Model 1: Basic CountVectorizer + MultinomialNB ---")

# The vocabulary is learned from the training split only and reused for the test split
count_vectorizer = CountVectorizer()
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained
nb_classifier = MultinomialNB().fit(X_train_counts, y_train)

# Score the baseline on the held-out split
y_pred = nb_classifier.predict(X_test_counts)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")





--- Model 1: Basic CountVectorizer + MultinomialNB ---
Accuracy: 0.7495

Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.84      0.62      0.71      2696
    Negative       0.70      0.86      0.77      4380
     Neutral       0.83      0.63      0.72      3605
    Positive       0.72      0.82      0.77      4119

    accuracy                           0.75     14800
   macro avg       0.77      0.73      0.74     14800
weighted avg       0.76      0.75      0.75     14800


Confusion Matrix:
[[1665  494  100  437]
 [  78 3774  180  348]
 [ 143  642 2264  556]
 [  89  459  181 3390]]


In [6]:
# 2. Advanced Model with TF-IDF and parameter tuning
print("\n--- Model 2: TF-IDF Vectorizer with Parameter Tuning ---")
# Vectorizer and classifier are wrapped in one pipeline so the grid search
# re-fits the TF-IDF step together with the classifier for every candidate/fold.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('classifier', MultinomialNB())
])

# Parameter grid for grid search: 2 x 2 x 2 x 3 = 24 candidates.
# Keys use the "<step name>__<parameter>" convention of Pipeline.
# NOTE(review): in the recorded run the tuned model scores below the untuned
# CountVectorizer baseline on the test split (0.7005 vs 0.7495). Every candidate
# here caps the vocabulary (max_features <= 10000, min_df >= 2) whereas the
# baseline uses the defaults; consider adding max_features=None and min_df=1
# to the grid so the search can also cover the baseline's setting.
param_grid = {
    'tfidf__max_features': [5000, 10000],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'tfidf__min_df': [2, 5],
    'classifier__alpha': [0.1, 0.5, 1.0]
}

# Grid search for optimal parameters.
# n_jobs=-1 spreads the 120 independent fits over all CPU cores; the selected
# parameters and scores are the same as with a serial search.
grid_search = GridSearchCV(
    pipeline, param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1
)
grid_search.fit(X_train, y_train)



--- Model 2: TF-IDF Vectorizer with Parameter Tuning ---
Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [7]:
# Summarise what the grid search selected
print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Score the best estimator found by the search on the held-out split
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
accuracy_best = accuracy_score(y_test, y_pred_best)
print(f"\nTest accuracy with best model: {accuracy_best:.4f}")
print("\nClassification Report:", classification_report(y_test, y_pred_best), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred_best), sep="\n")

# 3. Testing with custom input examples
print("\n--- Testing with Custom Input Examples ---")
examples = [
    "I love this product, it's amazing!",
    "This is the worst experience ever",
    "The movie was okay, nothing special",
    "I'm feeling happy today",
    "This made me very angry and disappointed"
]

# Classify all sample sentences in a single call to the best model
predictions = best_model.predict(examples)

# Show each sentence with its predicted label, followed by a blank line
print("\nCustom Input Predictions:")
for idx, sample in enumerate(examples):
    print(f"Text: '{sample}'\nPredicted sentiment: {predictions[idx]}\n")

# Allow for interactive input
def predict_sentiment(model, text):
    """Classify one piece of text and report how confident the model is.

    Args:
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``
            (here: the tuned TF-IDF + MultinomialNB pipeline).
        text: Raw text to classify.

    Returns:
        Tuple ``(label, confidence)``: the most probable class label and its
        probability expressed as a percentage.
    """
    # A single predict_proba call yields both the label and the confidence, so
    # the text is vectorised once instead of twice (predict + predict_proba).
    probabilities = model.predict_proba([text])[0]
    top = int(np.argmax(probabilities))
    return model.classes_[top], probabilities[top] * 100


print("\n--- Interactive Sentiment Analysis ---")
while True:
    try:
        user_input = input("\nEnter text for sentiment analysis (or 'quit' to exit): ")
    except (EOFError, KeyboardInterrupt, NotImplementedError):
        # No usable stdin (e.g. headless "Run All"; ipykernel raises a
        # NotImplementedError subclass when input requests are unsupported)
        # or the user interrupted the prompt: leave the loop instead of
        # failing the whole notebook run.
        print("\nInput unavailable or interrupted; leaving interactive mode.")
        break

    # Ignore surrounding whitespace so that e.g. "quit " still exits
    user_input = user_input.strip()
    if user_input.lower() == 'quit':
        break
    if not user_input:
        # An empty line has no tokens to classify; ask again
        print("Please enter some text.")
        continue

    # Predict sentiment
    sentiment, probability = predict_sentiment(best_model, user_input)

    print(f"Sentiment: {sentiment}")
    print(f"Confidence: {probability:.2f}%")


Best parameters: {'classifier__alpha': 0.1, 'tfidf__max_features': 10000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 1)}
Best cross-validation accuracy: 0.7032

Test accuracy with best model: 0.7005

Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.82      0.47      0.60      2696
    Negative       0.67      0.84      0.75      4380
     Neutral       0.74      0.60      0.67      3605
    Positive       0.67      0.79      0.72      4119

    accuracy                           0.70     14800
   macro avg       0.73      0.68      0.68     14800
weighted avg       0.72      0.70      0.69     14800


Confusion Matrix:
[[1270  579  246  601]
 [  64 3678  250  388]
 [ 124  669 2174  638]
 [  92  524  257 3246]]

--- Testing with Custom Input Examples ---

Custom Input Predictions:
Text: 'I love this product, it's amazing!'
Predicted sentiment: Positive

Text: 'This is the worst experience ever'
Predicted sentiment: Negative

Text: 'T