<h1>Wells Fargo’s Campus Analytics 2022 Challenge</h1>

<h2>Categorizing Transactions</h2>

In [None]:
from sklearnex import patch_sklearn
patch_sklearn()
#Import the LinearSVC from Sklearn.
from sklearn.svm import LinearSVC

#Use pandas library to be able to manipulate, import, and export data.
import pandas as pd

#Import TfidfVectorizer from the Sklearn library that would help with the extraction of features from text.
from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
#Read the three input files: labelled training data, unlabelled test data and the MCC lookup table.
data_for_training = pd.read_csv('training_Data_CAC_2022.csv')
data_for_testing = pd.read_csv('test_data_CAC_2022.csv')
iso_codes = pd.read_csv('iso18245.csv')

#Keep only the columns the model uses; the training frame also keeps the 'category' label.
training_data = pd.DataFrame(
    data=data_for_training,
    columns=['merchant_cat_code', 'coalesced_brand', 'category'],
)
test_data = pd.DataFrame(
    data=data_for_testing,
    columns=['merchant_cat_code', 'coalesced_brand'],
)

#Show the training frame as a quick check that the import worked.
training_data

In [None]:
#Show the test data frame (merchant_cat_code and coalesced_brand columns) to confirm it was imported correctly.
test_data

In [None]:
#Normalize the iso code descriptions so they can be used as model text:
#slashes and every other non-word, non-whitespace character become spaces, then the text is lower-cased.
#regex=True is passed explicitly because pandas 2.0 changed the default of str.replace to a literal
#match, which would silently leave the punctuation pattern without any effect.
#The pattern is a raw string so that \w and \s are not treated as (invalid) Python escape sequences.
iso_codes['DESCRIPTION'] = (
    iso_codes['DESCRIPTION']
    .str.replace('/', ' ', regex=False)
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.lower()
)

#Display iso codes to ensure they were imported and cleaned correctly.
iso_codes

<h3>Data Prep</h3>

In [None]:
#Replace every merchant category code (MCC) in the training data with its description from iso_codes.
#One vectorized lookup is used instead of nested loops with replace(..., inplace=True) on the column:
#the loops were O(rows x codes), and inplace edits through a column accessor do not write back to the
#data frame when pandas Copy-on-Write is active.
#Only the first description per MCC is kept and rows without an MCC are ignored.
mcc_descriptions = (
    iso_codes.dropna(subset=['MCC'])
    .drop_duplicates(subset='MCC')
    .set_index('MCC')['DESCRIPTION']
)
training_codes = training_data['merchant_cat_code']

#Codes that have no entry in iso_codes keep their original value.
#Missing values become the placeholder 'the' to keep the column consistent
#(presumably chosen because it is an English stop word that the vectorizer drops - TODO confirm).
training_data['merchant_cat_code'] = (
    training_codes.map(mcc_descriptions)
    .where(training_codes.isin(mcc_descriptions.index), training_codes)
    .fillna('the')
)

#Display training data to ensure the replacements were done properly.
training_data

In [None]:
#Replace every merchant category code (MCC) in the test data with its description from iso_codes.
#One vectorized lookup is used instead of nested loops with replace(..., inplace=True) on the column:
#the loops were O(rows x codes), and inplace edits through a column accessor do not write back to the
#data frame when pandas Copy-on-Write is active.
#Only the first description per MCC is kept and rows without an MCC are ignored.
mcc_descriptions = (
    iso_codes.dropna(subset=['MCC'])
    .drop_duplicates(subset='MCC')
    .set_index('MCC')['DESCRIPTION']
)
test_codes = test_data['merchant_cat_code']

#Codes that have no entry in iso_codes keep their original value.
#Missing values become the placeholder 'the' to keep the column consistent
#(presumably chosen because it is an English stop word that the vectorizer drops - TODO confirm).
test_data['merchant_cat_code'] = (
    test_codes.map(mcc_descriptions)
    .where(test_codes.isin(mcc_descriptions.index), test_codes)
    .fillna('the')
)

#Display test data to ensure the replacements were done properly.
test_data

In [None]:
#Combine merchant_cat_code and coalesced_brand into a single training_text column so that all the
#text used to train the model lives in one place.
#A missing brand is treated as empty text: otherwise the whole concatenation becomes NaN and the
#vectorizer later rejects it as an invalid document.
training_data['training_text'] = (
    training_data['merchant_cat_code'].astype(str)
    + ' '
    + training_data['coalesced_brand'].fillna('').astype(str)
)

#Copy only the required columns to a new data frame so no other column can leak into training.
final_training_data = training_data[['training_text', 'category']].copy()

#Clean the text: slashes and any other non-word, non-whitespace character become spaces, then lower-case.
#regex=True is explicit because pandas 2.0 made str.replace literal by default, which would leave the
#punctuation pattern without any effect.
final_training_data['training_text'] = (
    final_training_data['training_text']
    .str.replace('/', ' ', regex=False)
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.lower()
)

#Display final training data to ensure only the correct columns were copied.
final_training_data

In [None]:
#Combine merchant_cat_code and coalesced_brand into a single test_text column so that all the
#text used to test the model lives in one place.
#A missing brand is treated as empty text: otherwise the whole concatenation becomes NaN and the
#vectorizer later rejects it as an invalid document.
test_data['test_text'] = (
    test_data['merchant_cat_code'].astype(str)
    + ' '
    + test_data['coalesced_brand'].fillna('').astype(str)
)

#Copy only the required column to a new data frame so no other column can leak into testing.
final_test_data = test_data[['test_text']].copy()

#Clean the text: slashes and any other non-word, non-whitespace character become spaces, then lower-case.
#regex=True is explicit because pandas 2.0 made str.replace literal by default, which would leave the
#punctuation pattern without any effect.
final_test_data['test_text'] = (
    final_test_data['test_text']
    .str.replace('/', ' ', regex=False)
    .str.replace(r'[^\w\s]', ' ', regex=True)
    .str.lower()
)

#Display final test data to ensure only the correct column was copied.
final_test_data

In [None]:
#Encode every category as an integer id; ids are handed out in order of first appearance.
final_training_data['category_id'] = pd.factorize(final_training_data['category'])[0]

#One row per (category, category_id) pair, ordered by id.
category_id_final_training_data = (
    final_training_data.loc[:, ['category', 'category_id']]
    .drop_duplicates()
    .sort_values(by='category_id')
)

#Lookup from category name to id.
category_to_id = dict(zip(
    category_id_final_training_data['category'],
    category_id_final_training_data['category_id'],
))

#Lookup from id back to category name.
id_to_category = dict(zip(
    category_id_final_training_data['category_id'],
    category_id_final_training_data['category'],
))

#Display final training data to check it before extracting features from it.
final_training_data

<h3>Extract Features from Text</h3>

In [None]:
#Configure the TF-IDF vectorizer: unigrams and bigrams, English stop words removed,
#sublinear term-frequency scaling and l2-normalized rows.
tfidf = TfidfVectorizer(
    encoding='latin-1',
    stop_words='english',
    ngram_range=(1, 2),
    norm='l2',
    sublinear_tf=True,
)

#Learn the vocabulary and idf weights from the training text and build the document-term matrix.
#NOTE(review): .toarray() turns the sparse matrix into a dense array, which may need a lot of
#memory for a large vocabulary - confirm this is required downstream.
features_training = tfidf.fit_transform(final_training_data['training_text']).toarray()

#Integer category ids used as the training labels.
labels_training = final_training_data['category_id']

#Display the shape (documents, features) of the training matrix.
features_training.shape

In [None]:
#Project the test text onto the vocabulary already learned from the training text
#(transform only, no re-fitting) and convert the result to a dense array.
features_test = tfidf.transform(final_test_data['test_text']).toarray()

#Display the shape (documents, features) of the test matrix.
features_test.shape

<h3>Create and Run Model</h3>

In [None]:
#Give the data used by the model conventional, easy to read names.
X_train, y_train = features_training, labels_training
X_test = features_test

#Create the linear support vector classifier and fit it on the training data.
#NOTE(review): no random_state is set, so repeated runs may not be exactly reproducible - confirm
#whether a fixed seed is wanted.
model = LinearSVC()
model.fit(X_train, y_train)

#Predict the category id of every row in the test data.
y_prediction = model.predict(X_test)

<h3>Export Data</h3>

In [None]:
#Re-read the original test file so the exported solution keeps every original column.
#The path is relative, like the one used when the test data was first imported, instead of an
#absolute path that only exists on one machine.
final_data = pd.read_csv('test_data_CAC_2022.csv')

#Write the predicted category name for every row into the Category column.
#The predicted ids are translated back to names with a single vectorized lookup; building the
#Series on final_data.index fails loudly if the number of predictions does not match the file.
final_data['Category'] = pd.Series(y_prediction, index=final_data.index).map(id_to_category)

#Export the final data frame to a csv file next to the notebook.
#NOTE(review): to_csv also writes the row index as an unnamed first column; pass index=False if
#that column is not wanted in the solution file.
final_data.to_csv('final_data_CAC_2022_solution1.csv')

<h3>Extract Visuals</h3>

In [None]:
#Bar chart with the number of training rows per category.
fig = plt.figure(figsize=(8, 6))

final_training_data.groupby('category')['training_text'].count().plot(kind='bar', ylim=0)

plt.show()

In [None]:
#Train a second classifier on a 67/33 split of the training data to visualize how well it performs.
visual_model = LinearSVC()

(visual_X_train, visual_X_test,
 visual_y_train, visual_y_test,
 visual_indices_train, visual_indices_test) = train_test_split(
    features_training,
    labels_training,
    final_training_data.index,
    test_size=0.33,
    random_state=0,
)
visual_model.fit(visual_X_train, visual_y_train)
y_pred = visual_model.predict(visual_X_test)

#Pass the category ids explicitly so the rows and columns of the matrix always follow the same
#order as the tick labels, even if a category is missing from the hold-out split.
category_ids = category_id_final_training_data['category_id'].values
category_names = category_id_final_training_data['category'].values
conf_mat = confusion_matrix(visual_y_test, y_pred, labels=category_ids)

#Heatmap of actual (rows) versus predicted (columns) categories.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels=category_names, yticklabels=category_names, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
#Precision, recall and f1-score per category on the hold-out split.
#Ids and names are taken from the same lookup frame and passed together, so every row of the report
#is labelled with the right category even if one is missing from the split.
print(metrics.classification_report(
    visual_y_test,
    y_pred,
    labels=category_id_final_training_data['category_id'].values,
    target_names=category_id_final_training_data['category'].values,
))