In [12]:
import pandas as pd

# Sample (MCC, merchant text) pairs. Each code appears to be listed twice in a row:
# first with a category-style description, then with one example merchant.
mcc_name_pairs = [
    (5812, 'Restaurants, Fast Food'),
    (5812, 'Starbucks Coffee'),
    (5813, 'Movie Theaters, Streaming Services'),
    (5813, 'Netflix Subscription'),
    (4829, 'Wire Transfers, Money Transfer'),
    (4829, 'Western Union Transfer'),
    (5411, 'Supermarkets, Grocery Stores'),
    (5411, 'Whole Foods Market'),
    (6513, 'Real Estate Agents, Rental Properties'),
    (6513, 'Airbnb Rental'),
    (4121, 'Taxicabs, Limousines'),
    (4121, 'Uber Ride'),
    (5732, 'Electronics Stores'),
    (5732, 'Best Buy Electronics'),
    (6012, 'Financial Institutions, Banks'),
    (6012, 'Chase Bank Withdrawal'),
    (6011, 'Automated Cash Disburse'),
    (6011, 'ATM Withdrawal - Bank of America'),
    (5812, 'Restaurants, Sit-Down Dining'),
    (5812, 'Olive Garden'),
    (6300, 'Insurance Sales, Underwriting'),
    (6300, 'State Farm Insurance'),
    (4900, 'Utilities, Electric, Gas, Sanitary, Water'),
    (4900, 'Con Edison Bill Payment'),
    (5967, 'Direct Marketing - Subscription Services'),
    (5967, 'Amazon Prime Subscription'),
    (6012, 'Loan Payments, Financial Services'),
    (6012, 'Student Loan Payment'),
    (5712, 'Furniture, Home Furnishings Stores'),
    (5712, 'IKEA Furniture'),
    (6211, 'Securities Brokers, Dealers'),
    (6211, 'Charles Schwab Investment'),
    (5541, 'Service Stations (with or without ancillary services)'),
    (5541, 'Shell Gas Station'),
    (7997, 'Membership Clubs, Gyms'),
    (7997, 'Planet Fitness Membership'),
    (5812, 'Dining'),
    (5812, 'Chipotle Mexican Grill'),
    (5499, 'Miscellaneous Food Stores'),
    (5499, "Trader Joe's"),
    (5311, 'Department Stores'),
    (5311, "Macy's"),
]

# Split the pairs back into the column-oriented dict the DataFrame is built from.
data = {
    'MCC': [mcc for mcc, name in mcc_name_pairs],
    'Merchant_Name': [name for mcc, name in mcc_name_pairs],
}
df = pd.DataFrame(data)

In [13]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,MCC,Merchant_Name
0,5812,"Restaurants, Fast Food"
1,5812,Starbucks Coffee
2,5813,"Movie Theaters, Streaming Services"
3,5813,Netflix Subscription
4,4829,"Wire Transfers, Money Transfer"
5,4829,Western Union Transfer
6,5411,"Supermarkets, Grocery Stores"
7,5411,Whole Foods Market
8,6513,"Real Estate Agents, Rental Properties"
9,6513,Airbnb Rental


# Data Preprocessing

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Fetch every NLTK resource this notebook relies on. Besides the stopword list
# and WordNet, nltk.word_tokenize needs the Punkt tokenizer data: older NLTK
# releases look it up as 'punkt', newer ones as 'punkt_tab', so request both.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# Load data and apply basic cleaning.
# The path is relative to the notebook's working directory.
df = (
    pd.read_csv('data/mcc_merchant_data.csv')
    .dropna()  # drop rows with a missing value in any column
    .assign(Merchant_Name=lambda frame: frame['Merchant_Name'].str.lower())
    # De-duplicate only after lower-casing, so rows that differ purely in
    # letter case collapse into one instead of surviving as near-duplicates
    # that could end up on both sides of the train/test split.
    .drop_duplicates()
)

# Tokenization and Lemmatization
stop_words = set(stopwords.words('english'))  # set for fast membership tests
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise a merchant string into a space-separated string of lemmas.

    Steps: lower-case the text, tokenise it with ``nltk.word_tokenize``, keep
    only purely alphabetic tokens that are not English stopwords, lemmatise
    each remaining token with WordNet, and join the results with single spaces.

    The text is lower-cased here (not only by the caller) because the stopword
    membership test is case-sensitive; this keeps the function correct when it
    is applied to raw, mixed-case input such as new merchant names at
    prediction time. Lower-casing already lower-cased text is a no-op.

    Parameters
    ----------
    text : str
        Merchant name or description.

    Returns
    -------
    str
        The cleaned text; an empty string when no token survives filtering.

    Notes
    -----
    ``nltk.word_tokenize`` requires NLTK's Punkt tokenizer data to be installed.
    """
    tokens = nltk.word_tokenize(text.lower())
    lemmas = [
        lemmatizer.lemmatize(token)
        for token in tokens
        # isalpha() discards numbers, punctuation and fragments such as "'s".
        if token.isalpha() and token not in stop_words
    ]
    return ' '.join(lemmas)

from sklearn.preprocessing import LabelEncoder

# Run every merchant string through the tokenise / filter / lemmatise helper.
df['Merchant_Name'] = df['Merchant_Name'].apply(preprocess_text)

# Encoding MCC: replace each MCC value with an integer class id.
# `le` is kept so predictions can be mapped back with le.inverse_transform.
le = LabelEncoder()
le.fit(df['MCC'])
df['MCC'] = le.transform(df['MCC'])

# Splitting Data: features are the cleaned names, targets the encoded MCCs.
X, y = df['Merchant_Name'], df['MCC']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,    # hold out 20% of the rows for evaluation
    random_state=42,  # fixed seed so the split is reproducible
)


[nltk_data] Downloading package stopwords to C:\Users\M Rafay
[nltk_data]     Shaikh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\M Rafay
[nltk_data]     Shaikh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Model Development

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Build the two stages separately, then chain them in a single pipeline:
# TF-IDF features (vocabulary capped at 5000 terms) feed a linear-kernel SVM.
tfidf_step = ('tfidf', TfidfVectorizer(max_features=5000))
svm_step = ('clf', SVC(kernel='linear', C=1.0))
model = Pipeline(steps=[tfidf_step, svm_step])

# Fit the vectoriser and the classifier together on the training split.
model.fit(X_train, y_train)


# Model Evaluation

In [11]:
# Score the held-out split: predict labels, then compare them with the truth.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the fraction of exactly matching labels, rounded to two decimals.
print(f'Model Accuracy: {accuracy:.2f}')

Model Accuracy: 0.33
