In [2]:
# Description: This script trains a logistic regression model to predict the sentiment of movie reviews
# from the IMDB dataset. Reviews are vectorized with scikit-learn's TfidfVectorizer and classified with
# scikit-learn's LogisticRegression. The fitted vectorizer and model are saved to disk using the joblib library.
# Author: William Bruckmann



import pandas as pd
import nltk 
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import string 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, roc_curve, auc
from logistic_regression_sentiment_analysis import main




ModuleNotFoundError: No module named 'matplotlib'

In [None]:
def _english_text_tools():
    """Return ``(stop_words, stemmer)`` for English, building them only once.

    The stop-word set and the Snowball stemmer do not depend on the text being
    processed, so they are created on first use and cached on this function
    instead of being rebuilt for every review.
    """
    tools = getattr(_english_text_tools, '_cache', None)
    if tools is None:
        tools = (frozenset(stopwords.words('english')), SnowballStemmer("english"))
        _english_text_tools._cache = tools
    return tools


def preprocess_text(text):
    """Normalise a piece of raw text into a space-separated string of tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. delete every character listed in ``string.punctuation``;
      3. delete every digit character;
      4. tokenize with ``nltk.word_tokenize``;
      5. drop NLTK's English stop words;
      6. stem each remaining token with the English Snowball stemmer, except
         tokens ending in ``'ion'``, which are kept unchanged.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none
        survive).
    """
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Iterating over a str yields single characters, so this strips digits one by one.
    text = ''.join(char for char in text if not char.isdigit())
    tokens = nltk.word_tokenize(text)

    stop_words, stemmer = _english_text_tools()
    stemmed_tokens = [
        token if token.endswith('ion') else stemmer.stem(token)
        for token in tokens
        if token not in stop_words
    ]

    return ' '.join(stemmed_tokens)




def _load_preprocessed_reviews(csv_path='IMDB Dataset.csv'):
    """Read the review CSV and replace its 'review' column with preprocessed text."""
    df = pd.read_csv(csv_path)
    df['review'] = df['review'].apply(preprocess_text)
    return df


def main(train=False):
    """Train (or load) the sentiment model and evaluate it on a held-out split.

    The CSV 'IMDB Dataset.csv' is read from the current working directory and
    its 'review' column is preprocessed. The rows are then split 80/20 with a
    fixed ``random_state`` so that the held-out rows are the same on every call
    as long as the dataset itself is unchanged.

    Parameters
    ----------
    train : bool, default False
        If True, always fit a fresh vectorizer and model and overwrite the
        saved pickle files. If False, the saved pickle files are loaded when
        both exist; otherwise the model is trained and saved as if ``train``
        were True.

    Returns
    -------
    tuple
        ``(X_test, y_test, y_pred, y_pred_probs, df, model, tfidf_vectorizer)``
        where ``X_test`` is the TF-IDF matrix of the held-out reviews,
        ``y_test`` their 'sentiment' values, ``y_pred`` the predicted labels,
        ``y_pred_probs`` column 1 of ``model.predict_proba``, ``df`` the full
        preprocessed DataFrame, and ``model`` / ``tfidf_vectorizer`` the fitted
        estimator objects.
    """
    # Define the file names for the model and vectorizer
    tfidf_vectorizer_file = 'tfidf_vectorizer.pkl'
    model_file = 'sentiment_analysis_model.pkl'

    df = _load_preprocessed_reviews()

    # Split the text *before* vectorizing so that the TF-IDF vocabulary and IDF
    # weights are learned from training rows only, and so that both the
    # training path and the cached path evaluate on the same held-out rows
    # rather than on data the model has already seen.
    reviews_train, reviews_test, y_train, y_test = train_test_split(
        df['review'], df['sentiment'], test_size=0.2, random_state=42
    )

    artifacts_exist = os.path.exists(tfidf_vectorizer_file) and os.path.exists(model_file)

    if train or not artifacts_exist:
        # Create and save the TfidfVectorizer
        tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
        X_train = tfidf_vectorizer.fit_transform(reviews_train)
        joblib.dump(tfidf_vectorizer, tfidf_vectorizer_file)

        # Train and save the model
        model = LogisticRegression(max_iter=1000)
        model.fit(X_train, y_train)
        joblib.dump(model, model_file)
    else:
        # NOTE: joblib.load unpickles arbitrary objects; only load files that
        # were produced by this function or another trusted source.
        tfidf_vectorizer = joblib.load(tfidf_vectorizer_file)
        model = joblib.load(model_file)

    # Make predictions on the held-out data
    X_test = tfidf_vectorizer.transform(reviews_test)
    y_pred = model.predict(X_test)
    # Column 1 is the probability of model.classes_[1] — presumably 'positive'
    # when the labels are 'negative'/'positive'; confirm against model.classes_.
    y_pred_probs = model.predict_proba(X_test)[:, 1]

    return X_test, y_test, y_pred, y_pred_probs, df, model, tfidf_vectorizer


# Run the pipeline when this code executes as the main module (not when it is
# imported). The returned tuple is discarded here.
if __name__ == '__main__':
    main(train=True)


In [None]:



def plot_confusion_matrix(y_true, y_pred):
    """Display the confusion matrix of a negative/positive classifier as a heatmap.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; expected to contain 'negative' / 'positive'.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    labels = ['negative', 'positive']
    # Pass the labels explicitly so the row/column order of the matrix is
    # guaranteed to match the tick labels, and the matrix stays 2x2 even when
    # one of the classes never occurs in y_true or y_pred.
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

def plot_class_distribution(df):
    """Show a bar chart of how many rows carry each value of ``df['sentiment']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'sentiment' column.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    _fig, ax = plt.subplots(figsize=(8, 6))
    df['sentiment'].value_counts().plot(kind='bar', ax=ax)
    ax.set_xlabel('Sentiment')
    ax.set_ylabel('Count')
    ax.set_title('Class Distribution')
    plt.show()

def plot_roc_curve(y_true, y_pred_probs):
    """Plot the ROC curve, treating 'positive' as the positive class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred_probs : array-like
        Scores for the positive class, aligned with ``y_true``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``; the area under the curve
        appears in the legend.
    """
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _thresholds = roc_curve(y_true, y_pred_probs, pos_label='positive')
    roc_auc = auc(fpr, tpr)

    _fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic')
    ax.legend(loc="lower right")
    plt.show()


# Run the pipeline and draw the three diagnostic plots when this code executes
# as the main module. The unpacked names become module-level variables.
if __name__ == '__main__':
    X_test, y_test, y_pred, y_pred_probs, df, model, tfidf_vectorizer = main(train = True)

    # Evaluation plots for the data returned by main().
    plot_confusion_matrix(y_test, y_pred)
    plot_class_distribution(df)
    plot_roc_curve(y_test, y_pred_probs)




In [4]:
import joblib
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Define the file names for the model and vectorizer
# NOTE(review): these paths point at the parent directory, whereas main() writes
# 'tfidf_vectorizer.pkl' / 'sentiment_analysis_model.pkl' without the '../'
# prefix — confirm which location is intended.
tfidf_vectorizer_file = '../tfidf_vectorizer.pkl'
model_file = '../sentiment_analysis_model.pkl'

# Load the TfidfVectorizer and model
# NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
tfidf_vectorizer = joblib.load(tfidf_vectorizer_file)
model = joblib.load(model_file)

# Define a function to get user input and make predictions
def predict_sentiment():
    """Prompt for a piece of text, classify it, and print the predicted sentiment.

    Reads one line from standard input, cleans it with ``preprocess_text``,
    prints the cleaned text, vectorizes it with the module-level
    ``tfidf_vectorizer`` and classifies it with the module-level ``model``.

    Returns
    -------
    None
        The result is reported via ``print``.
    """
    user_text = input('Enter some text: ')

    # Apply the preprocessing and echo the cleaned text.
    cleaned_text = preprocess_text(user_text)
    print(cleaned_text)

    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([cleaned_text])
    predicted_label = model.predict(features)[0]

    # Anything other than 'positive' is reported as negative.
    message = (
        'The text has a positive sentiment.'
        if predicted_label == 'positive'
        else 'The text has a negative sentiment.'
    )
    print(message)

# Start the interactive prompt when this code executes as the main module.
if __name__ == '__main__':
    predict_sentiment()


ModuleNotFoundError: No module named 'scipy.sparse._csr'