In [1]:
import csv
import os
import pickle
import string
from datetime import datetime

import numpy as np
import pandas as pd
import psycopg2
from scipy.sparse import csr_matrix
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [2]:
# Cutoff timestamp: process_csv() keeps only rows whose "assigned_on" is strictly later than this.
start_time = datetime(2016, 10, 16)
def process_csv(path="loans_for_tagging.csv", since=None):
    """Collect the ids of loans that were assigned after a cutoff time.

    Reads a CSV with (at least) the columns ``loan_id`` and ``assigned_on``.
    Rows whose ``assigned_on`` is the literal string ``"NULL"`` are skipped;
    every other value is parsed with the format ``%Y-%m-%d %H:%M:%S``.

    Args:
        path: CSV file to read. Defaults to the file this notebook has
            always used.
        since: Exclusive lower bound for ``assigned_on``. When ``None`` the
            module-level ``start_time`` is used.

    Returns:
        list[int]: ``loan_id`` values, in file order, of the rows assigned
        strictly after the cutoff.

    Raises:
        ValueError: if a non-"NULL" ``assigned_on`` does not match the
            expected format, or a selected ``loan_id`` is not an integer.
    """
    cutoff = start_time if since is None else since
    loan_ids = []
    # The context manager closes the file handle, which the bare open() call
    # used to leak; newline="" is what the csv module asks for so that quoted
    # fields containing newlines are read back intact.
    with open(path, newline="") as handle:
        for row in csv.DictReader(handle):
            assigned = row["assigned_on"]
            if assigned == "NULL":
                continue
            assigned_at = datetime.strptime(assigned, "%Y-%m-%d %H:%M:%S")
            if assigned_at > cutoff:
                loan_ids.append(int(row["loan_id"]))
    return loan_ids
# Run the filter once; `result` holds the selected loan ids used by the cells below.
result = process_csv()

In [3]:
# Sanity check: number of loan ids returned by process_csv().
print(len(result))

91380


In [4]:
# Open the PostgreSQL connection and cursor used by the feature-extraction cell.
# The parameters are passed as keyword arguments instead of being spliced into
# a DSN string, so a password containing quotes, backslashes or spaces cannot
# corrupt the connection string. The password is read from the
# POSTGRES_PASSWORD environment variable (KeyError if it is not set).
conn = psycopg2.connect(
    dbname="autotag",
    user="postgres",
    host="localhost",
    password=os.environ["POSTGRES_PASSWORD"],
)
cur = conn.cursor()

In [5]:
# Names for the positions of a row fetched with `SELECT * FROM autotag`; the
# position of a name in this list is used to index into each fetched row.
# NOTE(review): this assumes the table's column order matches this list — confirm against the schema.
columns = ["id","name","original_language","original_description","translated_description",
           "funded_amount","loan_amount","status","image_id","video_id","activity","sector",
           "use","country_code","country_name","town","currency_policy","currency_exchange_coverage_rate",
           "currency","partner_id","posted_time","planned_expiration_time","disbursed_time","funded_time",
           "term_in_months","lender_count","journal_entries_count","bulk_journal_entries_count","tags",
           "borrower_names","borrower_genders","borrower_pictured","repayment_interval","distribution_model"]

# Columns kept as strings (encoded later) and columns converted to floats.
categorical = ["activity", "sector", "country_code", "town", "partner_id", "repayment_interval"]
numeric = ["loan_amount", "term_in_months"]

# Resolve every column position once instead of running columns.index(), a
# linear scan, for each field of each row.
column_index = {name: position for position, name in enumerate(columns)}

# One entry per kept loan in every list; the lists stay aligned by position.
final_data = {
    "categorical": [],
    "numeric": [],
    "use": [],
    "description": [],
    "tags": [],
    "ids": []
}

for loan_id in tqdm(result):
    cur.execute("SELECT * FROM autotag WHERE id = %s", (loan_id,))
    raw = cur.fetchone()
    if raw is None:
        # The id from the CSV has no row in the table.
        continue

    categorical_data = [str(raw[column_index[feature]]) for feature in categorical]
    numeric_data = [float(raw[column_index[feature]]) for feature in numeric]

    # Loans without gender information are dropped.
    genders = raw[column_index["borrower_genders"]]
    if genders is None:
        continue
    genders = genders.split(", ")
    # Third numeric feature: fraction of the listed borrowers that are "female".
    numeric_data.append(float(genders.count("female")) / len(genders))

    # Loans missing either text field are dropped as well.
    use = raw[column_index["use"]]
    if use is None:
        continue

    description = raw[column_index["translated_description"]]
    if description is None:
        continue

    # Keep only the tags that start with "#". startswith() is also safe for an
    # empty fragment (e.g. an empty tags string), where tag[0] raised IndexError.
    tags = raw[column_index["tags"]]
    final_tags = []
    if tags is not None:
        final_tags = [tag for tag in tags.split(", ") if tag.startswith("#")]

    final_data["categorical"].append(categorical_data)
    final_data["numeric"].append(numeric_data)
    final_data["use"].append(use)
    final_data["description"].append(description)
    final_data["ids"].append(raw[0])
    final_data["tags"].append(final_tags)

100%|██████████| 91380/91380 [06:01<00:00, 252.96it/s] 


In [6]:
# Convert every collected list to a NumPy array for the cells below.
# "tags" holds lists of different lengths, so it is stored as a 1-D object
# array with one Python list per loan. Passing such a ragged list straight to
# np.array() raises ValueError on NumPy >= 1.24, and would turn into a 2-D
# array if every loan happened to carry the same number of tags.
for key in final_data:
    if key == "tags":
        tag_lists = final_data[key]
        tag_array = np.empty(len(tag_lists), dtype=object)
        for position, tag_list in enumerate(tag_lists):
            tag_array[position] = tag_list
        final_data[key] = tag_array
    else:
        final_data[key] = np.array(final_data[key])

In [7]:
# Inspect the per-loan tag lists.
print(final_data["tags"])

[list(['#Supporting Family', '#Repeat Borrower'])
 list(['#Eco-friendly', '#Biz Durable Asset', '#Technology', '#Schooling', '#Parent', '#Woman Owned Biz'])
 list(['#Eco-friendly', '#Schooling', '#Technology', '#Biz Durable Asset', '#Parent', '#Woman Owned Biz'])
 ..., list([]) list([]) list([])]


In [10]:
# def scan_text(text_list):
#     words = set()
#     for text in text_list:
#         current_word = ""
#         for character in text:
#             if character in string.whitespace or character in string.punctuation:
#                 words.add(current_word)
#                 current_word = ""
#                 if character in string.punctuation:
#                     words.add(character)
#             else:                
#                 current_word += character.lower()
#     words.add(current_word)
#     return list(words)
            
# use_words = scan_text(final_data["use"].flatten())
# description_words = scan_text(final_data["description"].flatten())

In [11]:
# def vectorize(full_text, word_list, max_length):
#     output = []
#     for text in tqdm(full_text):
#         vectorized = []
#         current_word = ""
#         for character in text:
#             if character in string.whitespace or character in string.punctuation:
#                 try:
#                     index = word_list.index(current_word)
#                 except ValueError:
#                     index = -1
#                 vectorized.append(index+1)
#                 current_word = ""
#                 if character in string.punctuation:
#                     try:
#                         index = word_list.index(character)
#                     except ValueError:
#                         index = -1
#                     vectorized.append(index+1)
#             else:
#                 current_word += character.lower()
                
#         try:
#             index = word_list.index(current_word)
#         except ValueError:
#             index = -1
#         vectorized.append(index+1)
        
#         if len(vectorized) > max_length:
#             print(len(vectorized))
#         for i in range(len(vectorized), max_length):
#             vectorized.append(0)
#         output.append(np.array(vectorized))
#     return np.array(output)

In [8]:
# Container for the processed feature matrices; the following cells fill in its keys.
processed_data = {}

In [41]:
# TF-IDF features for the "use" text: English stop words removed, terms must
# appear in at least 3 documents, n-gram lengths from 1 up to 10000 tokens.
use_vect = TfidfVectorizer(
    stop_words='english',
    min_df=3,
    ngram_range=(1, 10000),
)
use_matrix = use_vect.fit_transform(final_data["use"])
processed_data["use"] = use_matrix
print(use_matrix.shape)

(76953, 62691)


In [None]:
# TF-IDF features for the translated description: English stop words removed,
# terms must appear in at least 3 documents, n-grams of 1 to 4 tokens.
description_vect = TfidfVectorizer(
    stop_words='english',
    min_df=3,
    ngram_range=(1, 4),
)
description_matrix = description_vect.fit_transform(final_data["description"])
processed_data["description"] = description_matrix
print(description_matrix.shape)
# 16.6  (original author's note; what it measures is not recorded)

In [32]:
# DictVectorizer takes one mapping per sample, so every record becomes
# {feature position: value}.
categorical_dictionaries = [
    dict(enumerate(record)) for record in final_data["categorical"]
]
vect = DictVectorizer()
# Dense copy of the encoded matrix; this rebinds `categorical`, which earlier
# held the list of categorical column names.
categorical = vect.fit_transform(categorical_dictionaries).toarray()

In [33]:
# Base feature matrix: the encoded categorical columns followed by the numeric columns.
processed_data["base"] = np.hstack((categorical, final_data["numeric"]))

In [34]:
# Multi-hot encode the tag lists: one row per loan, one column per distinct tag.
tags = final_data["tags"]
# Sort the vocabulary so the column order is the same on every run; iterating a
# set of strings yields an order that can change between interpreter sessions.
possible_tags = sorted({tag for tag_list in tags for tag in tag_list})
# A dictionary lookup replaces the linear possible_tags.index() scan per tag.
tag_column = {tag: column for column, tag in enumerate(possible_tags)}
processed_tags = np.zeros((len(tags), len(possible_tags)), dtype=int)
for row, tag_list in enumerate(tags):
    for tag in tag_list:
        processed_tags[row, tag_column[tag]] = 1

In [35]:
# Inspect the encoded tag matrix.
print(processed_tags)

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 1 0]
 [0 0 0 ..., 0 1 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [36]:
# Store the encoded tag matrix next to the feature matrices.
processed_data["tags"] = processed_tags

In [37]:
# Show every entry of processed_data (this prints the full dictionary repr).
print(processed_data)

{'use': <76953x645578 sparse matrix of type '<class 'numpy.float64'>'
	with 1963500 stored elements in Compressed Sparse Row format>, 'description': <76953x11158133 sparse matrix of type '<class 'numpy.float64'>'
	with 27016079 stored elements in Compressed Sparse Row format>, 'base': array([[  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          5.00000000e+03,   8.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          2.50000000e+02,   3.80000000e+01,   1.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          2.50000000e+02,   3.80000000e+01,   1.00000000e+00],
       ..., 
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          2.50000000e+01,   1.30000000e+01,   1.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
          2.50000000e+01,   1.30000000e+01,   1.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.000

In [39]:
def _save_pickle(obj, filename):
    """Pickle ``obj`` to ``filename`` and close the file afterwards.

    The bare ``open(...)`` calls used before never closed their handles, which
    left flushing the data to disk up to the garbage collector.
    """
    with open(filename, "wb") as handle:
        pickle.dump(obj, handle)


# Persist the processed matrices, the tag vocabulary and the fitted vectorizers.
_save_pickle(processed_data, "processed_data.pkl")
# pickle.dump(use_words, open("use_words.pkl", "wb"))
# pickle.dump(description_words, open("description_words.pkl", "wb"))
_save_pickle(vect, "categorical_vect.pkl")
_save_pickle(possible_tags, "possible_tags.pkl")
_save_pickle(description_vect, "description_vect.pkl")
_save_pickle(use_vect, "use_vect.pkl")