In [9]:
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
    if pd.notnull(text):
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        tokens = text.split()
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        clean_text = ' '.join(tokens)
        return clean_text
    else:
        return ''

def preprocess_and_split_data(df, text_column, target_column, test_size=0.2, random_state=42):
    df['Cleaned Text'] = df[text_column].apply(preprocess_text)
    threshold = 3
    df['Sentiment'] = df[target_column].apply(lambda x: 1 if x >= threshold else 0)
    X_train, X_test, y_train, y_test = train_test_split(df['Cleaned Text'], df['Sentiment'], test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

def train_model(X_train, y_train, pipeline, param_grid, scoring='f1_weighted', cv=5):
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, scoring=scoring, cv=cv, return_train_score=True, verbose=1)
    with mlflow.start_run() as run:
        grid_search.fit(X_train, y_train)
    return grid_search
    
def evaluate_model(model, X_train, X_test, y_train, y_test):
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    return {
        'F1 Score': f1_score(y_test, y_test_pred, average='weighted'),
        'Accuracy': accuracy_score(y_test, y_test_pred),
        'train_score': accuracy_score(y_train, y_train_pred),
        'test_score': accuracy_score(y_test, y_test_pred)
    }

def main():
    file_path = r"C:\Users\keert\Downloads\reviews_badminton\data.csv"
    text_column = 'Review text'
    target_column = 'Ratings'
    
    # Load dataset
    df = pd.read_csv(file_path)
    
    # Preprocess and split data
    X_train, X_test, y_train, y_test = preprocess_and_split_data(df, text_column, target_column)

    # Define models and pipelines
    models = {
        'Random Forest': RandomForestClassifier(),
        'Logistic Regression': LogisticRegression(),
        'Support Vector Machine': SVC()
    }

    pipelines = {}
    for model_name, model in models.items():
        pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('model', model)
        ])
        pipelines[model_name] = pipeline
    
    # Define hyperparameter grid for each model
    param_grids = {
        'Random Forest': {
            'model__n_estimators': [100, 200, 300],
            'model__max_depth': [None, 10, 20]
        },
        'Logistic Regression': {
            'model__C': [0.1, 1.0, 10.0],
            'model__solver': ['liblinear', 'lbfgs']
        },
        'Support Vector Machine': {
            'model__C': [0.1, 1.0, 10.0],
            'model__kernel': ['linear', 'rbf']
        }
    }
    
    best_models = {}
    results = {}

    # Train and evaluate models
    for model_name, pipeline in pipelines.items():
        param_grid = param_grids[model_name]
        grid_search = train_model(X_train, y_train, pipeline, param_grid)
        results[model_name] = evaluate_model(grid_search.best_estimator_, X_train, X_test, y_train, y_test)
        best_models[model_name] = grid_search.best_estimator_
    
    # Print results
    for model_name, metrics in results.items():
        print(f"Metrics for {model_name}:")
        print("Best hyperparameters:", best_models[model_name].get_params())
        print("F1 Score:", metrics['F1 Score'])
        print("Accuracy:", metrics['Accuracy'])
        print("Train Score:", metrics['train_score'])
        print("Test Score:", metrics['test_score'])
        print()

if __name__ == "__main__":
    main()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\keert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\keert\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Metrics for Random Forest:
Best hyperparameters: {'memory': None, 'steps': [('vectorizer', TfidfVectorizer()), ('model', RandomForestClassifier(n_estimators=200))], 'verbose': False, 'vectorizer': TfidfVectorizer(), 'model': RandomForestClassifier(n_estimators=200), 'vectorizer__analyzer': 'word', 'vectorizer__binary': False, 'vectorizer__decode_error': 'strict', 'vectorizer__dtype': <class 'numpy.float64'>, 'vectorizer__encoding': 'utf-8', 'vectorizer__input': 'content', 'vectorizer__lowercase': True, 'vectorizer__max_df': 1.0, 'vectorizer__max_features': None, 'vectorizer__min_df': 1, 'vectorizer__ngram_range': (1, 1), 'vectorizer__norm': 'l2', 'vectorizer__preprocessor': None, 'vectorizer__smooth_idf': True, 'vectorizer__stop_words': None, 'vectorizer__strip_accents': None, 'vectorizer__sublinear_tf': Fals