In [None]:
from tkinter import *
from tkinter import filedialog

def select_folder(message):
    """Ask the user to pick a folder through a directory-chooser dialog.

    Args:
        message (str): Text used as the title of the Tk root window and of
            the directory dialog.

    Returns:
        The value returned by ``filedialog.askdirectory``: the path of the
        folder selected by the user (typically an empty string if the user
        cancels the dialog -- callers should check before using it).
    """
    root = Tk()
    try:
        root.title(message)
        # Hide the otherwise empty root window so only the dialog is shown.
        root.withdraw()
        directory = filedialog.askdirectory(initialdir="/", title=message)
    finally:
        # Always tear the Tk instance down, even if the dialog call raises.
        root.destroy()

    return directory

In [None]:
import pandas as pd 
import os as os 

def load_data(directory):
    """Load the "sentiment labelled sentences" dataset into one DataFrame.

    Args:
        directory (str): The sentiment labelled sentences directory,
            containing ``yelp_labelled.txt``, ``amazon_cells_labelled.txt``
            and ``imdb_labelled.txt``.

    Returns:
        pandas.DataFrame: One row per sentence with the columns
        ``sentence``, ``label`` and ``company`` (``'yelp'``, ``'amazon'`` or
        ``'imdb'``). The per-file frames are concatenated without
        re-indexing, so index values repeat across companies.
    """
    import csv  # Local import: only needed for the quoting constant below.

    # Map each company to the file holding its labelled sentences.
    filenames = {
        'yelp': 'yelp_labelled.txt',
        'amazon': 'amazon_cells_labelled.txt',
        'imdb': 'imdb_labelled.txt',
    }

    frames = []
    for company, filename in filenames.items():
        frame = pd.read_csv(
            os.path.join(directory, filename),
            names=['sentence', 'label'],
            sep='\t',  # Each line is "<sentence><TAB><label>".
            # The sentences are free text: treat '"' as an ordinary
            # character, otherwise a sentence starting with a quote opens a
            # quoted field that swallows the lines that follow it.
            quoting=csv.QUOTE_NONE,
        )
        frame['company'] = company
        frames.append(frame)

    return pd.concat(frames)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize_sentences(df):
    """Turn the sentences of ``df`` into bag-of-words count vectors.

    Args:
        df (pandas.DataFrame): The sentences with corresponding labels; the
            ``sentence`` and ``label`` columns are read.

    Returns:
        tuple: ``(sentences_BOW, labels)`` where ``sentences_BOW`` is the
        dense array produced by ``CountVectorizer`` for the sentences (one
        row per sentence) and ``labels`` is the ``label`` column as a NumPy
        array.
    """
    sentences = df['sentence'].tolist()

    # min_df=1 keeps every token that occurs in at least one sentence. That
    # is the same vocabulary the former integer min_df=0 selected, but 0 is
    # rejected by parameter validation in recent scikit-learn releases.
    # lowercase=False keeps the original casing of the tokens.
    vectorizer = CountVectorizer(min_df=1, lowercase=False)

    # Learn the vocabulary and encode the sentences in a single pass, then
    # densify the sparse result. BOW: bag of words.
    sentences_BOW = vectorizer.fit_transform(sentences).toarray()

    labels = df['label'].to_numpy()

    return sentences_BOW, labels

In [None]:
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression

def use_model():
    """Train a logistic-regression sentiment classifier and print its accuracy.

    Prompts the user for the dataset folder, loads and vectorizes the
    sentences, holds out 10% of the rows for testing, fits a
    ``LogisticRegression`` classifier on the remainder and prints the
    accuracy measured on the held-out rows.

    No ``random_state`` is passed to ``train_test_split``, so the split (and
    therefore the printed accuracy) can differ between runs.

    Returns:
        None. The accuracy is only printed.
    """

    #### (1) Load the data and vectorize it.

    # Select the directory via a GUI.
    directory = select_folder('Please select the "sentiment labelled sentences" txt datasets.')

    # Load the data.
    df = load_data(directory)

    # Vectorize the data.
    sentences_BOW, labels = vectorize_sentences(df)

    #### (2) Split the dataset into training and testing subsets.

    x_train, x_test, y_train, y_test = train_test_split(sentences_BOW, labels, test_size=0.1)

    #### (3) Create the model, fit it to the training data and evaluate it.

    classifier = LogisticRegression()
    classifier.fit(x_train, y_train)
    score = classifier.score(x_test, y_test)

    # Print the accuracy. `score` is a fraction in [0, 1]; the `%` format
    # spec scales it by 100 so the printed number matches the percent sign.
    print(f"The accuracy is {score:.1%}")