In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Function to preprocess text
def preprocess_text(text):
    if not isinstance(text, str):
        return ""  # Return empty string for non-string values (e.g., floats or NaNs)
    
    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize the text
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords.words('english')]
    return ' '.join(tokens)

# Load the data
file_path = r'C:\Users\asshe\Desktop\Sentiment analysis\twitter_training.csv'
df = pd.read_csv(file_path, header=None)

# Rename columns appropriately (you may need to adjust these names based on your dataset)
df.columns = ['ID', 'Game', 'Sentiment', 'Text']

# Apply preprocessing to the 'Text' column, handling floats
df['Cleaned_Text'] = df['Text'].apply(preprocess_text)

# Encode the sentiment labels
le = LabelEncoder()
df['Encoded_Sentiment'] = le.fit_transform(df['Sentiment'])

# Vectorize the cleaned text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['Cleaned_Text'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['Encoded_Sentiment'], test_size=0.2, random_state=42)

# Train a RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Evaluate the model
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Function to predict sentiment for new statements
def predict_sentiment(new_statement):
    cleaned_statement = preprocess_text(new_statement)
    transformed_statement = tfidf.transform([cleaned_statement])
    prediction = rf.predict(transformed_statement)
    return le.inverse_transform(prediction)[0]

# Example usage
new_statement = "I hate this product!"
predicted_sentiment = predict_sentiment(new_statement)
print(f"The predicted sentiment for the statement is: {predicted_sentiment}")


Model Accuracy: 87.22%
The predicted sentiment for the statement is: Negative
