In [1]:
import numpy as np
import pandas as pd
import random
import os
import sys

In [2]:
# # data_loc = "C:\Users\russe\Desktop\adult.data"
# columns = ["age","workclass","fnlwgt","education","education-num","marital-status","occupation",
#            "relationship","race","sex","capital-gain","capital-loss","hours-per-week","country"]

# data = pd.read_csv("data/Datasets/Recommender Systems/adult.data",names=columns,index_col=False,na_values='?')
# print(data.isin(['?']).any(),"\n")
# data.head()

In [24]:
# Candidate ages run from age_floor up to (but not including) age_ceiling,
# followed by an 'Unknown' placeholder entry.
age_floor, age_ceiling = 20, 65
age_list = [*range(age_floor, age_ceiling), 'Unknown']

# Candidate values for each categorical feature. Every pool ends with an
# 'Unknown' entry, which the later filtering step counts per row.

# Gender options.
gender_list = ['Male','Female','Other','Unknown']

# Industry / job-sector options.
job_sector_list = ["Agriculture","Manufacturing","Public administration","Other services","Forestry, fishing, mining, oil and gas","Utilities","Transportation/warehousing",
                   "Trade","Finance, insurance, real estate and leasing","Accommodation and food services","Educational services","Information, culture and recreation",
                   "Construction","Health care and social assistance","Professional, scientific and technical services","Business, building and other support services","Other","Unknown"]

# Occupation titles. Several contain '&', ',' or '/' characters, which the
# text-cleaning step later strips out.
occupation_list = ["Dentist","Registered Nurse","Pharmacist","Computer Systems Analyst","Physician","Database Administrator","Software Developer","Physical Therapist",
                   "Web Developer","Dental Hygienist","Occupational Therapist","Veterinarian","Computer Programmer","School Psychologist","Physical Therapist Assistant",
                   "Interpreter & Translator","Mechanical Engineer","Veterinary Technologist & Technician","Epidemiologist","IT Manager","Maintenance & Repair Worker",
                   "Market Research Analyst","Diagnostic Medical Sonographer","Computer Systems Administrator","Respiratory Therapist","Medical Secretary","Civil Engineer",
                   "Substance Abuse Counselor","Speech-Language Pathologist","Landscaper & Groundskeeper","Radiologic Technologist","Cost Estimator","Financial Advisor",
                   "Marriage & Family Therapist","Medical Assistant","Lawyer","Accountant","Compliance Officer","High School Teacher","Clinical Laboratory Technician",
                   "Bookkeeping, Accounting, & Audit Clerk","Financial Manager","Recreation & Fitness Worker","Insurance Agent","Elementary School Teacher","Dental Assistant",
                   "Management Analyst","Home Health Aide","Pharmacy Technician","Construction Manager","Public Relations Specialist","Middle School Teacher","Massage Therapist",
                   "Paramedic","Preschool Teacher","Hairdresser","Marketing Manager","Patrol Officer","School Counselor","Executive Assistant","Financial Analyst",
                   "Personal Care Aide","Clinical Social Worker","Business Operations Manager","Loan Officer","Meeting, Convention & Event Planner","Mental Health Counselor",
                   "Sales Representative","Architect","Sales Manager","HR Specialist","Plumber","Real Estate Agent","Glazier","Art Director","Customer Service Representative",
                   "Logistician","Auto Mechanic","Bus Driver","Restaurant Cook","Child & Family Social Worker","Administrative Assistant","Receptionist","Paralegal",
                   "Cement Mason & Concrete Finisher","Painter","Sports Coach","Teacher Assistant","Brickmason & Blockmason","Cashier","Janitor","Electrician",
                   "Delivery Truck Driver","Maid & Housekeeper","Carpenter","Security Guard","Construction Worker","Fabricator","Telemarketer","Nursing Aide","Other","Unknown"]

# Education-level options.
education_list = ['Bachelors','Masters','Doctorate','Some-college','College Diploma','Vocational Diploma','Highschool','Preschool',"Other",'Unknown']

# Marital-status options.
marital_status_list = ['Single','Married','Separated','Widowed',"Other",'Unknown']

# Country options.
# NOTE(review): 'Africa' is a continent rather than a country -- confirm this entry is intended.
country_list = ['Sweden','Germany','Mexico','Japan','Indonesia','Russia','India','China','Africa','France','Spain','England','Canada','USA','Trinidad','Australia',"Other",'Unknown']

# Column name -> pool of candidate values for that column.
# Insertion order is preserved and determines the column order of the
# generated DataFrame.
mappings = dict([
    ('Age', age_list),
    ('Gender', gender_list),
    ('Job_Sector', job_sector_list),
    ('Occupation', occupation_list),
    ('Education', education_list),
    ('Marital-Status', marital_status_list),
    ('Country', country_list),
])

In [26]:
#function to create dataframe that is n-rows long. Randomly samples feature-data from respective dictionary[key] list
def create_dataframe(mapping_dict,size,random_seed=None):
    """Build a DataFrame of ``size`` rows by randomly sampling each column.

    Parameters
    ----------
    mapping_dict : dict
        Maps a column name to the sequence of candidate values for that
        column. Key order determines the column order of the result.
    size : int
        Number of rows to generate.
    random_seed : int or None, optional
        Seed passed to ``np.random.RandomState``; use a fixed value for
        reproducible data.

    Returns
    -------
    pandas.DataFrame
        One column per key of ``mapping_dict`` and ``size`` rows.

    Notes
    -----
    Values are drawn with ``RandomState.choice``, which first converts each
    pool to a NumPy array. A pool that mixes numbers and strings is therefore
    coerced to strings before sampling.
    """
    # Seeded generator so the same seed reproduces the same data
    rng = np.random.RandomState(random_seed)

    # Take the column names from the argument, not from a notebook global,
    # so the function works for any mapping passed in.
    column_names = list(mapping_dict.keys())

    # Convert each pool to an array once instead of on every single draw.
    value_pools = [np.asarray(mapping_dict[name]) for name in column_names]

    # Draw row by row (one value per column) so the sequence of random draws,
    # and therefore the data produced for a given seed, stays the same.
    data_list = [[rng.choice(pool) for pool in value_pools] for _ in range(size)]

    data_df = pd.DataFrame(data_list, columns=column_names)
    print(f"DataFrame Construction Complete\nCurrent Size:{len(data_df)}")
    return data_df

In [43]:
# Generate the synthetic user table with a fixed seed, then preview the first rows
df = create_dataframe(mapping_dict=mappings, size=400_000, random_seed=2021)
df.head()

DataFrame Construction Complete
Current Size:400000


Unnamed: 0,Age,Gender,Job_Sector,Occupation,Education,Marital-Status,Country
0,41,Female,Agriculture,Carpenter,Highschool,Other,Canada
1,26,Other,Transportation/warehousing,Marriage & Family Therapist,Vocational Diploma,Married,Trinidad
2,53,Male,Utilities,Civil Engineer,Doctorate,Single,Japan
3,36,Other,Utilities,Construction Manager,Highschool,Separated,Trinidad
4,27,Female,Transportation/warehousing,Financial Manager,Vocational Diploma,Married,Spain


In [6]:
# def create_mappings(df):
#     conv_list = {}
#     for feature in df.columns.unique():
#         for var in df[feature].unique():
#             if var == 'Unknown':
#                 conv_list[var] = float(0)
#             else:
#                 conv_list[var] = float(len(df[df[feature]==var])/len(df[feature]))
#         print(f"Mapping {feature} Completed")
#     return conv_list

# def generate_rating_matrix(x,map_dict):
#     val_list = []
#     for item in x.values:
#         if item in map_dict.keys():
#             val_list.append(map_dict[item])
#         else:
#             val_list.append('Unknown')
#     return val_list

# df_mapping = create_mappings(data_df)
# matrix_df = data_df.copy()
# matrix_df['Rating'] = data_df.apply(generate_rating_matrix, args=[df_mapping], axis=1)
# print('Matrix Generation Completed')
# matrix_df.head()

## Separate data rows where 'Unknown' value is present in > 75% of columns

In [44]:
# Count the 'Unknown' entries in each row and compare against 75% of the column count.
unknown_counts = (df == 'Unknown').sum(axis=1)
unknown_limit = df.shape[1] * 0.75

# Keep rows where fewer than 75% of the columns are 'Unknown'.
# .copy() makes this an independent frame, so later column assignments do not
# operate on a slice of df.
data_df = df[unknown_counts < unknown_limit].copy()

# Everything else (75% or more 'Unknown') goes to a separate frame; using >=
# guarantees the two frames together cover every row of df.
low_data_df = df[unknown_counts >= unknown_limit].copy()

## Data & Text Preprocessing

In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import unicodedata
import string
import re
import time

In [8]:
# Process and Convert Dataframe column types into 'Category' Type for faster processing
def manage_dtypes(df):
    """Return a copy of ``df`` with every column converted to ``category`` dtype.

    Numeric columns are converted to strings first, so their categories are
    string labels (e.g. ``'41'``) rather than numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; the conversion is done on a copy.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and index, all of ``category`` dtype.
    """
    # Work on a copy so the caller's frame (possibly a slice of a larger
    # frame) is never mutated.
    converted = df.copy()

    # np.number already covers every integer and float dtype.
    numeric_cols = converted.select_dtypes(include=[np.number]).columns.to_list()
    if numeric_cols:
        converted[numeric_cols] = converted[numeric_cols].astype('str')

    # After the step above no numeric columns remain, so every column is
    # converted to categorical in one pass.
    return converted.astype('category')

# Function to compile data along a row into single string
def generate_soup(df_row):
    """Join all values of one row into a single space-separated string.

    Parameters
    ----------
    df_row : pandas.Series
        One row of a DataFrame (as passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The row's values, in column order, separated by single spaces.
        Non-string values are converted with ``str`` so that numeric cells do
        not raise ``TypeError`` in ``str.join``.
    """
    return " ".join(str(value) for value in df_row.values.tolist())

In [9]:
# Convert every column to categorical dtype
data_df = manage_dtypes(data_df)

# Build a new frame that carries an extra 'soup' column holding each row's
# values joined into one string; data_df itself is left without that column.
soup_df = data_df.assign(soup=data_df.apply(generate_soup, axis=1))
print('Soup is Ready\n')

Soup is Ready



In [10]:
# import dask.dataframe as dd
# from dask.multiprocessing import get
# import time

# def dask_this(df_series):
#     result = df_series.apply(process_text)
#     return result

# ddata = dd.from_pandas(soup_df['soup'], npartitions=10)
# res = ddata.map_partitions(dask_this).compute(scheduler='processes', num_workers=10)
# soup_df['dask'] = res

In [11]:
# # convert number into words
# import inflect
# p = inflect.engine()

# def convert_number(text):
#     # split string into list of words
#     temp_str = text.split()
#     # initialise empty list
#     new_string = []
  
#     for word in temp_str:
#         # if word is a digit, convert the digit to numbers and append into the new_string list
#         if word.isdigit():
#             temp = p.number_to_words(word)
#             new_string.append(temp)
  
#         # append the word as it is
#         else:
#             new_string.append(word)
  
#     # join words together into new string
#     temp_str = " ".join(new_string)
#     return temp_str

In [12]:
# remove stopwords function
def remove_stopwords(text):
    """Remove English stop words from ``text``.

    The text is tokenized with ``word_tokenize`` and the surviving tokens are
    re-joined with single spaces. Tokens are compared against the stop-word
    list exactly as tokenized (no lowercasing is applied here).

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        ``text`` with stop-word tokens dropped.
    """
    # Load the stop-word list once and cache it on the function object;
    # rebuilding it on every call is wasteful when applied row by row.
    stop_words = getattr(remove_stopwords, "_stop_words", None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words("english"))
        remove_stopwords._stop_words = stop_words

    word_tokens = word_tokenize(text)
    return " ".join(word for word in word_tokens if word not in stop_words)

# Run preprocessing
def process_text(text):
    """Normalize a text string for downstream vectorization.

    Steps, in order: remove stop words, strip accents / non-ASCII characters,
    lowercase, drop every character that is not a letter, digit or space, and
    collapse whitespace.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Lowercase text containing only ``a-z``, ``0-9`` and single spaces,
        with no leading or trailing whitespace.
    """
    # Stop words are removed first, while the text still has its original casing
    without_stopwords = remove_stopwords(text)

    # Decompose accented characters, then drop whatever is not plain ASCII
    ascii_text = (
        unicodedata.normalize('NFKD', without_stopwords)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
    )

    # Lowercase, then keep only letters, digits and spaces. This already
    # removes all punctuation, so no separate punctuation pass is needed.
    alnum_text = re.sub('[^A-Za-z0-9 ]+', '', ascii_text.lower())

    # Collapse runs of whitespace and trim the ends
    return " ".join(alnum_text.split())

In [16]:
# Time the text-cleaning pass over every row of the 'soup' column
start = time.time()
soup_df['clean_text'] = soup_df['soup'].apply(process_text)
elapsed = time.time() - start
print("--- %s seconds ---" % elapsed)

--- 104.51848268508911 seconds ---


In [17]:
# Display the frame with the new 'soup' and 'clean_text' columns
# (the notebook shows a truncated view of the first and last rows).
soup_df

Unnamed: 0,Age,Gender,Job_Sector,Occupation,Education,Marital-Status,Country,soup,clean_text
0,41,Female,Agriculture,Carpenter,Highschool,Other,Canada,41 Female Agriculture Carpenter Highschool Oth...,41 female agriculture carpenter highschool oth...
1,26,Other,Transportation/warehousing,Marriage & Family Therapist,Vocational Diploma,Married,Trinidad,26 Other Transportation/warehousing Marriage &...,26 other transportationwarehousing marriage fa...
2,53,Male,Utilities,Civil Engineer,Doctorate,Single,Japan,53 Male Utilities Civil Engineer Doctorate Sin...,53 male utilities civil engineer doctorate sin...
3,36,Other,Utilities,Construction Manager,Highschool,Separated,Trinidad,36 Other Utilities Construction Manager Highsc...,36 other utilities construction manager highsc...
4,27,Female,Transportation/warehousing,Financial Manager,Vocational Diploma,Married,Spain,27 Female Transportation/warehousing Financial...,27 female transportationwarehousing financial ...
...,...,...,...,...,...,...,...,...,...
399995,39,Unknown,Construction,Dentist,Bachelors,Separated,Russia,39 Unknown Construction Dentist Bachelors Sepa...,39 unknown construction dentist bachelors sepa...
399996,30,Other,Health care and social assistance,Child & Family Social Worker,Masters,Separated,Germany,30 Other Health care and social assistance Chi...,30 other health care social assistance child f...
399997,32,Female,Accommodation and food services,Software Developer,Masters,Single,Germany,32 Female Accommodation and food services Soft...,32 female accommodation food services software...
399998,52,Unknown,Construction,Database Administrator,College Diploma,Separated,Sweden,52 Unknown Construction Database Administrator...,52 unknown construction database administrator...


## CountVectorizer / TF-IDF (use_idf=False) Use Case for Building a Recommender System