In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Step 1: Load the dataset with the correct delimiter
# Replace 'sentiment-analysis.csv' with the actual file path.
df = pd.read_csv('sentiment-analysis.csv', delimiter=',')  # Use ',' or '\t' based on your file
print(f"Dataset loaded with {len(df)} records.")
print("Dataset columns:", df.columns)

# Step 2: Split combined column if necessary
if len(df.columns) == 1:
    # Assuming the data is separated by commas in the combined column
    df = df['Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score'].str.split(',', expand=True)
    # Rename columns based on their order in the dataset
    df.columns = ['Text', 'Sentiment', 'Source', 'Date/Time', 'User ID', 'Location', 'Confidence Score']

print("Updated columns after splitting:", df.columns)

# Step 3: Data Cleaning
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    # Convert to lowercase
    text = text.lower()
    # Tokenization and remove stopwords
    words = [word for word in text.split() if word not in stop_words]
    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(words)

# Use the actual column name for reviews and sentiments
df['cleaned_text'] = df['Text'].apply(clean_text)  # Replace 'Text' with your column name for reviews
print("Text cleaning completed.")

# Step 4: Vectorization using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text']).toarray()
y = df['Sentiment']  # Replace 'Sentiment' with your column name for sentiment labels

# Step 5: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Data split into train and test sets. Training on {len(X_train)} samples.")

# Step 6: Model Selection and Training
model = LogisticRegression()
model.fit(X_train, y_train)
print("Model training completed.")

# Step 7: Model Evaluation
y_pred = model.predict(X_test)

# Metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=model.classes_, yticklabels=model.classes_)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/varikuntlasaimanoj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/varikuntlasaimanoj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Dataset loaded with 98 records.
Dataset columns: Index(['Text, Sentiment, Source, Date/Time, User ID, Location, Confidence Score'], dtype='object')
Updated columns after splitting: Index(['Text', 'Sentiment', 'Source', 'Date/Time', 'User ID', 'Location',
       'Confidence Score'],
      dtype='object')
Text cleaning completed.
Data split into train and test sets. Training on 78 samples.


ValueError: Input contains NaN