In [6]:
import nltk

# Download the punkt tokenizer models
nltk.download('punkt')
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [9]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle
import re

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load your dataset
file_path = r'C:\Users\asshe\Desktop\Sentiment analysis\sentimentdataset.csv'
df = pd.read_csv(file_path)

# Clean the 'Text' column
df['Cleaned_Text'] = df['Text'].apply(lambda x: x.lower())  # Convert to lowercase

# Tokenize, remove stopwords, and lemmatize
df['Cleaned_Text'] = df['Cleaned_Text'].apply(lambda x: word_tokenize(x))
df['Cleaned_Text'] = df['Cleaned_Text'].apply(lambda x: [word for word in x if word not in stopwords.words('english')])

lemmatizer = WordNetLemmatizer()
df['Cleaned_Text'] = df['Cleaned_Text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Continue with your sentiment analysis...
df['Cleaned_Text'] = df['Cleaned_Text'].apply(lambda x: ' '.join(x))

# Encode the sentiment labels
le = LabelEncoder()
df['Encoded_Sentiment'] = le.fit_transform(df['Sentiment'])

# Vectorize the cleaned text using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(df['Cleaned_Text'])

# Define the target variable
y = df['Encoded_Sentiment']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model with class weights
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X_train, y_train)

# Predict on the test data
y_pred = model.predict(X_test)

# Print accuracy and classification report
labels = sorted(set(y_test))  # Get unique labels in y_test
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, labels=labels, target_names=le.inverse_transform(labels)))

# Save the vectorizer and model
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

with open('sentiment_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Load the vectorizer and model
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_vectorizer = pickle.load(file)

with open('sentiment_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Sample preprocessing function
def preprocess_text(text):
    cleaned_text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'\@\w+|\#', '', cleaned_text)
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    cleaned_text = cleaned_text.lower().split()
    cleaned_text = [word for word in cleaned_text if word not in stopwords.words('english')]
    cleaned_text = ' '.join(cleaned_text)
    return cleaned_text

def predict_sentiment(new_statement):
    cleaned_statement = preprocess_text(new_statement)
    transformed_statement = loaded_vectorizer.transform([cleaned_statement])
    prediction = loaded_model.predict(transformed_statement)
    sentiment_label = le.inverse_transform(prediction)
    return sentiment_label[0]

def predict_sentiment_with_proba(new_statement):
    cleaned_statement = preprocess_text(new_statement)
    transformed_statement = loaded_vectorizer.transform([cleaned_statement])
    prediction_proba = loaded_model.predict_proba(transformed_statement)
    sentiment_label = le.inverse_transform([prediction_proba.argmax()])
    return sentiment_label[0], prediction_proba

# Example usage
new_statement = "I hate this product!"
predicted_sentiment = predict_sentiment(new_statement)
print(f"The predicted sentiment for the statement is: {predicted_sentiment}")

test_statements = [
    "I love this product!",
    "This is the worst thing ever.",
    "Absolutely fantastic!",
    "I hate this product."
]

for statement in test_statements:
    print(f"Statement: {statement}")
    print(f"Predicted Sentiment: {predict_sentiment(statement)}")

# Example usage with probabilities
predicted_sentiment, proba = predict_sentiment_with_proba(new_statement)
print(f"The predicted sentiment for the statement is: {predicted_sentiment}")
print(f"Prediction probabilities: {proba}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.2108843537414966
                        precision    recall  f1-score   support

         Acceptance          0.00      0.00      0.00         2
           Admiration        0.00      0.00      0.00         1
        Admiration           0.00      0.00      0.00         1
         Affection           1.00      1.00      1.00         1
      Ambivalence            1.00      1.00      1.00         1
         Anger               0.00      0.00      0.00         1
        Anticipation         0.00      0.00      0.00         1
        Arousal              0.38      1.00      0.55         3
                  Awe        0.00      0.00      0.00         1
         Awe                 0.00      0.00      0.00         1
                  Bad        1.00      1.00      1.00         1
             Betrayal        0.00      0.00      0.00         2
        Betrayal             0.00      0.00      0.00         1
         Bitter              1.00      1.00      1.00         1
          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle
import re

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load your dataset
file_path = r'C:\Users\asshe\Downloads\twitter_training.csv'
df = pd.read_csv(file_path)

# Clean the 'Text' column
df['Cleaned_Text'] = df['Text'].apply(lambda x: x.lower())  # Convert to lowercase

# Tokenize, remove stopwords, and lemmatize
df['Cleaned_Text'] = df['Cleaned_Text'].apply(lambda x: word_tokenize(x))
df['Cleaned_Text'] = df['Cleaned_Text'].apply(lambda x: [word for word in x if word not in stopwords.words('english')])

lemmatizer = WordNetLemmatizer()
df['Cleaned_Text'] = df['Cleaned_Text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Continue with your sentiment analysis...
df['Cleaned_Text'] = df['Cleaned_Text'].apply(lambda x: ' '.join(x))

# Encode the sentiment labels
le = LabelEncoder()
df['Encoded_Sentiment'] = le.fit_transform(df['Sentiment'])
print(le.classes_)  # Print out the classes to ensure correct encoding

# Vectorize the cleaned text using TF-IDF
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')
X = vectorizer.fit_transform(df['Cleaned_Text'])

# Define the target variable
y = df['Encoded_Sentiment']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Classifier
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_search = GridSearchCV(rf_model, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model from GridSearch
best_rf_model = grid_search.best_estimator_

# Predict on the test data
y_pred = best_rf_model.predict(X_test)

# Print accuracy and classification report
labels = sorted(set(y_test))  # Get unique labels in y_test
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, labels=labels, target_names=le.inverse_transform(labels)))

# Save the vectorizer and model
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

with open('sentiment_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)

# Load the vectorizer and model
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_vectorizer = pickle.load(file)

with open('sentiment_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Sample preprocessing function
def preprocess_text(text):
    cleaned_text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'\@\w+|\#', '', cleaned_text)
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    cleaned_text = cleaned_text.lower().split()
    cleaned_text = [word for word in cleaned_text if word not in stopwords.words('english')]
    cleaned_text = ' '.join(cleaned_text)
    return cleaned_text

def predict_sentiment(new_statement):
    cleaned_statement = preprocess_text(new_statement)
    transformed_statement = loaded_vectorizer.transform([cleaned_statement])
    prediction = loaded_model.predict(transformed_statement)
    sentiment_label = le.inverse_transform(prediction)
    return sentiment_label[0]

def predict_sentiment_with_proba(new_statement):
    cleaned_statement = preprocess_text(new_statement)
    transformed_statement = loaded_vectorizer.transform([cleaned_statement])
    prediction_proba = loaded_model.predict_proba(transformed_statement)
    sentiment_label = le.inverse_transform([prediction_proba.argmax()])
    return sentiment_label[0], prediction_proba

# Example usage
new_statement = "I hate this product!"
predicted_sentiment, proba = predict_sentiment_with_proba(new_statement)
print(f"The predicted sentiment for the statement is: {predicted_sentiment}")
print(f"Prediction probabilities: {proba}")

test_statements = [
    "I love this product!",
    "This is the worst thing ever.",
    "Absolutely fantastic!",
    "I hate this product."
]

for statement in test_statements:
    print(f"Statement: {statement}")
    print(f"Predicted Sentiment: {predict_sentiment(statement)}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


KeyError: 'Text'

In [13]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
import pickle
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import re

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load your dataset
file_path = r'C:\Users\asshe\Desktop\Sentiment analysis\sentimentdataset.csv'
df = pd.read_csv(file_path)

# Clean the 'Text' column
def preprocess_text(text):
    cleaned_text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'\@\w+|\#', '', cleaned_text)
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    cleaned_text = cleaned_text.lower().split()
    cleaned_text = [word for word in cleaned_text if word not in stopwords.words('english')]
    cleaned_text = ' '.join(cleaned_text)
    return cleaned_text

df['Cleaned_Text'] = df['Text'].apply(preprocess_text)

# Encode the sentiment labels
le = LabelEncoder()
df['Encoded_Sentiment'] = le.fit_transform(df['Sentiment'])
print(le.classes_)  # Print out the classes to ensure correct encoding

# Vectorize the cleaned text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Cleaned_Text'])

# Define the target variable
y = df['Encoded_Sentiment']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check class distribution before applying SMOTE
print(f"Original class distribution: {Counter(y_train)}")

# Handle class imbalance using SMOTE, adjust k_neighbors if necessary
if len(Counter(y_train)) > 1 and min(Counter(y_train).values()) > 1:
    smote = SMOTE(random_state=42, k_neighbors=min(5, min(Counter(y_train).values()) - 1))
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
    print(f"Resampled class distribution: {Counter(y_train_resampled)}")
else:
    print("Skipping SMOTE due to insufficient samples in minority class.")
    X_train_resampled, y_train_resampled = X_train, y_train

# Set up RandomForest model with hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_resampled, y_train_resampled)

# Best model
best_rf_model = grid_search.best_estimator_

# Predict on the test data
y_pred = best_rf_model.predict(X_test)

# Print accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=le.classes_))

# Save the vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

# Save the model
with open('sentiment_model.pkl', 'wb') as file:
    pickle.dump(best_rf_model, file)

# Load the vectorizer
with open('tfidf_vectorizer.pkl', 'rb') as file:
    loaded_vectorizer = pickle.load(file)

# Load the model
with open('sentiment_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Function to predict sentiment
def predict_sentiment(new_statement):
    cleaned_statement = preprocess_text(new_statement)
    transformed_statement = loaded_vectorizer.transform([cleaned_statement])
    prediction = loaded_model.predict(transformed_statement)
    sentiment_label = le.inverse_transform(prediction)
    return sentiment_label[0]

# Example usage
test_statements = [
    "I love this product!",
    "This is the worst thing ever.",
    "Absolutely fantastic!",
    "I hate this product."
]

for statement in test_statements:
    print(f"Statement: {statement}")
    print(f"Predicted Sentiment: {predict_sentiment(statement)}")

# Function to predict sentiment with probability
def predict_sentiment_with_proba(new_statement):
    cleaned_statement = preprocess_text(new_statement)
    transformed_statement = loaded_vectorizer.transform([cleaned_statement])
    prediction_proba = loaded_model.predict_proba(transformed_statement)
    sentiment_label = le.inverse_transform([prediction_proba.argmax()])
    return sentiment_label[0], prediction_proba

# Example usage with probabilities
new_statement = "I hate this product!"
predicted_sentiment, proba = predict_sentiment_with_proba(new_statement)
print(f"The predicted sentiment for the statement is: {predicted_sentiment}")
print(f"Prediction probabilities: {proba}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asshe\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[' Acceptance   ' ' Acceptance      ' ' Accomplishment ' ' Admiration '
 ' Admiration   ' ' Admiration    ' ' Adoration    ' ' Adrenaline     '
 ' Adventure ' ' Affection    ' ' Amazement ' ' Ambivalence '
 ' Ambivalence     ' ' Amusement    ' ' Amusement     ' ' Anger        '
 ' Anticipation ' ' Anticipation  ' ' Anxiety   ' ' Anxiety         '
 ' Appreciation  ' ' Apprehensive ' ' Arousal       ' ' ArtisticBurst '
 ' Awe ' ' Awe    ' ' Awe          ' ' Awe           ' ' Bad '
 ' Betrayal ' ' Betrayal      ' ' Bitter       ' ' Bitterness '
 ' Bittersweet ' ' Blessed       ' ' Boredom ' ' Boredom         '
 ' Breakthrough ' ' Calmness     ' ' Calmness      ' ' Captivation '
 ' Celebration ' ' Celestial Wonder ' ' Challenge ' ' Charm ' ' Colorful '
 ' Compassion' ' Compassion    ' ' Compassionate ' ' Confidence    '
 ' Confident ' ' Confusion ' ' Confusion    ' ' Confusion       '
 ' Connection ' ' Contemplation ' ' Contentment ' ' Contentment   '
 ' Coziness     ' ' Creative Inspirati



Accuracy: 0.29931972789115646


ValueError: Number of classes, 123, does not match size of target_names, 279. Try specifying the labels parameter

In [22]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Define the column names based on the structure observed
column_names = ['ID', 'Game', 'Sentiment', 'Text']

# Load the dataset with correct column names and delimiters
df = pd.read_csv(r'C:\Users\asshe\Desktop\Sentiment analysis\twitter_training.csv', delimiter=',', names=column_names, quotechar='"')

# Display the first few rows to confirm the correct structure
print(df.head())

# Handle NaN values in the 'Text' column by filling them with an empty string
df['Text'] = df['Text'].fillna('')

# Preprocess the data
# Convert text to lowercase (you can add more preprocessing steps as needed)
df['Text'] = df['Text'].str.lower()

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Sentiment'], test_size=0.2, random_state=42)

# Vectorize the text data
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_vect, y_train)

# Predict on the test set
y_pred = model.predict(X_test_vect)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Predict the sentiment of a new statement
def predict_sentiment(new_statement):
    new_statement_vect = vectorizer.transform([new_statement])
    prediction = model.predict(new_statement_vect)
    return prediction[0]

# Map sentiment to emoji
def sentiment_to_emoji(sentiment):
    if sentiment == 'Positive':
        return '😊'
    elif sentiment == 'Negative':
        return '😐'
    else:
        return '😐'  # Default to neutral emoji for unknown sentiment

# Get user input
user_input = input("Enter a statement to analyze its sentiment: ")

# Predict the sentiment of the user's input
predicted_sentiment = predict_sentiment(user_input)
predicted_emoji = sentiment_to_emoji(predicted_sentiment)

print("Predicted Sentiment for '{}' : {} {}".format(user_input, predicted_sentiment, predicted_emoji))

     ID         Game Sentiment  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                Text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.8189730200174065
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.84      0.74      0.79      2592
    Negative       0.85      0.86      0.85      4519
     Neutral       0.84      0.78      0.81      3596
    Positive       0.76      0.86      0.81      4230

    accuracy                           0.82     14937
   macro avg       0.82      0.81      0.81     14937
weighted avg       0.82      0.82      0.82     14937



Enter a statement to analyze its sentiment:  i love this product


Predicted Sentiment for 'i love this product' : Positive 😊
