In [None]:
pip install pandas scikit-learn joblib openpyxl

In [34]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
import joblib
import pandas as pd

# Load the data
file_path = r"C:\Users\prati\OneDrive\Desktop\Project\Reddit\Comment_training.xlsx"  # Your file path
df = pd.read_excel(file_path)

# Step 1: Preprocess the data
# Drop rows that lack either the text or the label. A missing "Comment" would
# otherwise reach TfidfVectorizer as NaN, which it rejects as an invalid
# document when the pipeline is fitted.
df = df.dropna(subset=["Comment", "Result"])
label_encoder = LabelEncoder()
df["Sentiment"] = label_encoder.fit_transform(df["Result"])
# NOTE: LabelEncoder only produces codes 0..n_classes-1, so there is no -1
# sentinel to filter out afterwards; missing labels are handled by dropna above.

# Step 2: Split the data into training and testing sets (80% train, 20% test)
# Cast to str so cells that Excel stored as numbers are still valid documents
# for the vectorizer's lowercasing/tokenizing steps.
X = df["Comment"].astype(str)
y = df["Sentiment"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Define models
# Candidate classifiers keyed by display name. Insertion order matters: it is
# the order in which they are trained and evaluated later on.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
models["Random Forest"] = RandomForestClassifier(n_estimators=100, random_state=42)
models["Support Vector Machine"] = SVC(kernel="linear", random_state=42)
models["Naive Bayes"] = MultinomialNB()

# Step 4: Training and evaluation
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    """Train every candidate model behind a TF-IDF vectorizer and pick the most accurate.

    Each classifier is wrapped in a ``Pipeline`` whose first step is a
    ``TfidfVectorizer(max_features=5000, stop_words="english")``, fitted on the
    training data and scored with ``accuracy_score`` on the test data.

    Args:
        models: Mapping of display name -> unfitted classifier instance.
        X_train: Training documents.
        y_train: Training labels.
        X_test: Held-out documents used for scoring.
        y_test: Held-out labels used for scoring.

    Returns:
        A tuple ``(best_model, best_model_name, best_accuracy, results)`` where
        ``best_model`` is the fitted pipeline with the highest accuracy (the
        earliest one wins ties), ``best_model_name`` is its key in ``models``,
        and ``results`` maps each model name to ``{"accuracy": <float>}``.
        If ``models`` is empty the result is ``(None, None, 0.0, {})``.
    """
    best_model = None
    # Initialised up front so the return statement never hits an unbound name
    # (previously raised UnboundLocalError when no model beat the initial 0).
    best_model_name = None
    best_accuracy = 0.0
    results = {}

    for model_name, model in models.items():
        print(f"Training {model_name}...")

        # Create a pipeline with TF-IDF and the model
        pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(max_features=5000, stop_words="english")),
            ("classifier", model),
        ])

        # Train the model
        pipeline.fit(X_train, y_train)

        # Evaluate the model
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Store results
        results[model_name] = {"accuracy": accuracy}

        # The first evaluated model is always accepted, so a run where every
        # model scores 0.0 still returns a usable pipeline. After that, only a
        # strictly better score replaces the incumbent (earliest wins ties).
        if best_model is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = pipeline
            best_model_name = model_name

        print(f"{model_name} Accuracy: {accuracy}\n")

    return best_model, best_model_name, best_accuracy, results

# Step 5: Train and evaluate models
best_model, best_model_name, best_accuracy, results = train_and_evaluate(models, X_train, y_train, X_test, y_test)

# Step 6: Print the best model and save it
print(f"Best Model: {best_model_name} with Accuracy: {best_accuracy}")

# Save the best model for future use.
# Compare against None explicitly: the truthiness of a Pipeline is decided by
# its __len__ (number of steps), which is not the same question as "was a
# model selected?".
if best_model is not None:
    joblib.dump(best_model, "best_sentiment_model_simplified.pkl")
    # Persist the encoder as well so predicted integer codes can be mapped
    # back to the original "Result" labels at inference time.
    joblib.dump(label_encoder, "label_encoder.pkl")
    print("Best model saved as 'best_sentiment_model_simplified.pkl'.")


Training Logistic Regression...
Logistic Regression Accuracy: 0.7368421052631579

Training Random Forest...
Random Forest Accuracy: 0.631578947368421

Training Support Vector Machine...
Support Vector Machine Accuracy: 0.7368421052631579

Training Naive Bayes...
Naive Bayes Accuracy: 0.7368421052631579

Best Model: Logistic Regression with Accuracy: 0.7368421052631579
Best model saved as 'best_sentiment_model_simplified.pkl'.
