In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Improved preprocessing
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    
    # Keep negation words by modifying stopwords list
    custom_stopwords = set(stopwords.words('english')) - {'not', 'no', 'nor', 'neither'}
    
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in custom_stopwords]
    return ' '.join(tokens)

# Load data
file_path = r'twitter_training.csv'
df = pd.read_csv(file_path, header=None)
df.columns = ['ID', 'Game', 'Sentiment', 'Text']

# Drop rows with missing values
df = df.dropna(subset=['Text', 'Sentiment'])

# Preprocess
df['Cleaned_Text'] = df['Text'].apply(preprocess_text)

# Encode labels
le = LabelEncoder()
df['Encoded_Sentiment'] = le.fit_transform(df['Sentiment'])

# Check class distribution
print("Class distribution:")
print(df['Sentiment'].value_counts())

# Use n-grams in TF-IDF
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X = tfidf.fit_transform(df['Cleaned_Text'])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, df['Encoded_Sentiment'], test_size=0.2, random_state=42)

# Train model with class weights
class_weights = 'balanced'  # Automatically adjusts weights inversely proportional to class frequencies
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weights)
rf.fit(X_train, y_train)

# Evaluate
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

# Prediction function
def predict_sentiment(new_statement):
    cleaned_statement = preprocess_text(new_statement)
    transformed_statement = tfidf.transform([cleaned_statement])
    prediction = rf.predict(transformed_statement)
    return le.inverse_transform(prediction)[0]

# Test cases
test_cases = [
    "I hate this product!",
    "product is not good",
    "this is amazing",
    "not bad at all",
    "worst experience ever"
]

print("\nTesting sample statements:")
for statement in test_cases:
    sentiment = predict_sentiment(statement)
    print(f"'{statement}': {sentiment}")

# User interaction
while True:
    user_input = input("\nEnter a statement to analyze (or 'quit' to exit): ")
    if user_input.lower() == 'quit':
        break
    predicted_sentiment = predict_sentiment(user_input)
    print(f"Predicted Sentiment: {predicted_sentiment}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Class distribution:
Sentiment
Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64

Model Accuracy: 87.64%
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.93      0.80      0.86      2696
    Negative       0.89      0.91      0.90      4380
     Neutral       0.82      0.88      0.85      3605
    Positive       0.88      0.89      0.88      4119

    accuracy                           0.88     14800
   macro avg       0.88      0.87      0.87     14800
weighted avg       0.88      0.88      0.88     14800


Testing sample statements:
'I hate this product!': Negative
'product is not good': Negative
'this is amazing': Positive
'not bad at all': Negative
'worst experience ever': Negative



Enter a statement to analyze (or 'quit' to exit):  it is not good product


Predicted Sentiment: Negative



Enter a statement to analyze (or 'quit' to exit):  it is not as it looks like 


Predicted Sentiment: Negative



Enter a statement to analyze (or 'quit' to exit):  good product


Predicted Sentiment: Positive



Enter a statement to analyze (or 'quit' to exit):  quit
