In [37]:
# a script to open a file and store its content in a variable

def read_file(file: str):
    """Read a text file and return everything it contains.

    Args:
        file: The name (path) of the file to open. The file is opened in
            the default text mode with the platform's default encoding.

    Returns:
        The full contents of the file as a string, or the message
        "Please specify the file to read" when `file` is empty or None.
    """
    # Guard clause: nothing to open when no file name was supplied.
    if not file:
        return "Please specify the file to read"

    with open(file=file) as handle:
        return handle.read()

In [None]:
# Building a Model with Random Forest Algorithm
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import hstack


def data_cleaner(file_path: str):
    """Load the issues CSV and apply the cleaning steps needed before modelling.

    Steps, in order: drop rows without an `issue_label`, drop exact
    duplicate rows, fill missing text/association values with placeholders,
    and remove the columns that are not used as features.

    Args:
        file_path: the path to a CSV file containing the data

    Returns:
        The cleaned DataFrame, or the message
        "Please specify a complete path to the file with data" when the
        file does not exist.
    """
    # Only the read itself can fail with FileNotFoundError.
    try:
        raw = pd.read_csv(file_path)
    except FileNotFoundError:
        return "Please specify a complete path to the file with data"

    # Rows without a label are removed first, then exact duplicates.
    frame = raw.dropna(subset=['issue_label']).drop_duplicates()

    # Placeholder used for each feature column when its value is missing.
    placeholders = {
        'issue_body': 'NA',
        'issue_title': 'NA',
        'issue_author_association': 'unknown',
    }
    for column, placeholder in placeholders.items():
        frame[column] = frame[column].fillna(placeholder)

    # These columns are not used as model features.
    return frame.drop(columns=['issue_url', 'repository_url', 'issue_created_at'])

def data_preprocessor(data):
    """Turn the cleaned issue columns into one numeric feature matrix.

    Args:
        data: the features to be processed; must provide the columns
            'issue_title', 'issue_body' and 'issue_author_association'

    Returns:
        A single matrix whose columns are, in order, the one-hot encoded
        author association, the TF-IDF features of the title and the
        TF-IDF features of the body.

    NOTE: the vectorizers and the encoder are fitted inside this function
    and are not returned, so they cannot be reused on other data.
    """
    # Title and body each get their own TF-IDF vocabulary
    # (English stop words removed).
    title_features = TfidfVectorizer(stop_words='english').fit_transform(
        data['issue_title']
    )
    body_features = TfidfVectorizer(stop_words='english').fit_transform(
        data['issue_body']
    )

    # The encoder needs 2-D input, hence the single-column frame.
    author_column = data['issue_author_association'].to_frame()
    author_features = OneHotEncoder(sparse_output=False).fit_transform(author_column)

    # Stack the three feature groups side by side.
    return hstack([author_features, title_features, body_features])

def data_splitter(X, y, *, test_size=0.2, random_state=42):
    """Split preprocessed data into training and testing sets.

    Args:
        X: preprocessed data to be split
        y: labels to the data
        test_size: share of the data held out for testing, passed straight
            to `train_test_split`. Defaults to 0.2, the value that used to
            be hard-coded.
        random_state: seed for the shuffle, so the split is reproducible.
            Defaults to 42, the value that used to be hard-coded.

    Returns:
        Tuple of split data (X_train, X_test, y_train, y_test)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def model_trainer(X_train, y_train, *, n_estimators=100, random_state=42):
    """Train a Random Forest classifier.

    Args:
        X_train: preprocessed training data without labels
        y_train: preprocessed training data labels
        n_estimators: number of trees in the forest. Defaults to 100, the
            value that used to be hard-coded.
        random_state: seed for the forest, so training is reproducible.
            Defaults to 42, the value that used to be hard-coded.

    Returns:
        The trained model instance
    """
    # Create an instance of Random Forest Classifier
    rf_model = RandomForestClassifier(
        n_estimators=n_estimators, random_state=random_state
    )

    # Train the model
    rf_model.fit(X_train, y_train)

    return rf_model

def model_evaluater(X_test, y_test, model):
    """Evaluate a trained model on the held-out test data.

    Args:
        X_test: preprocessed data without labels, not seen during training
        y_test: the true labels for `X_test`, used for evaluation
        model: the model to test; it must provide a `predict` method

    Returns:
        A dict with two entries: 'report' holds the output of
        `classification_report` for the predictions, and 'y_pred' holds
        the predicted values themselves.
    """
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions)

    return {'report': report, 'y_pred': predictions}

def visualisation(y_pred, y_test):
    """Show the confusion matrix of the predictions as an annotated heatmap.

    Args:
        y_pred: predicted values
        y_test: true values the predictions are compared against

    Returns:
        None; the figure is displayed with `plt.show()`.
    """
    # confusion_matrix puts true labels on rows and predictions on columns,
    # which is why the y axis is 'True' and the x axis is 'Predicted'.
    matrix = confusion_matrix(y_test, y_pred)
    axes = sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    axes.set_xlabel('Predicted')
    axes.set_ylabel('True')
    plt.show()


# End-to-end run: clean -> encode labels -> vectorize -> split -> train -> evaluate.
data = data_cleaner('dataset/sample1.csv')
if isinstance(data, str):
    # data_cleaner reports a missing file by returning a message string;
    # stop here with that message instead of failing later on `str.drop`.
    raise FileNotFoundError(data)

model_inputs = data.drop(columns=['issue_label'])
labels = data['issue_label'].to_frame()
# Encode labels. LabelEncoder expects a 1-D array of labels, so pass the
# column itself rather than the one-column frame (which makes scikit-learn
# emit a DataConversionWarning).
encoder = LabelEncoder()
y = encoder.fit_transform(labels['issue_label'])

X = data_preprocessor(model_inputs)
X_train, X_test, y_train, y_test = data_splitter(X, y)

model = model_trainer(X_train, y_train)
evaluation_data = model_evaluater(X_test, y_test, model)
y_pred = evaluation_data['y_pred']
visualisation(y_pred, y_test)

print(evaluation_data['report'])