In [536]:
# Import libraries
import numpy as np
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold,train_test_split
from sklearn.metrics import recall_score

In [537]:
# Setting dictionary for industries and keywords for decision makers
indu_dict=    {'Accounting': "Financials",
                'Airlines/Aviation': "Consumer Discretionary",
                'Alternative Dispute Resolution': "Other",
                'Alternative Medicine': "Health Care",
                'Apparel & Fashion': "Consumer Discretionary",
                'Architecture & Planning': "Industrial",
                'Arts and Crafts': "Consumer Discretionary",
                'Automotive': "Industrial",
                'Aviation & Aerospace': "Industrial",
                'Banking': "Financials",
                'Biotechnology': "Health Care",
                'Broadcast Media': "Comm Services",
                'Building Materials': "Industrial",
                'Business Supplies and Equipment': "Industrial",
                'Capital Markets': "Financials",
                'Chemicals': "Industrial",
                'Civic & Social Organization': "Other",
                'Civil Engineering': "Industrial",
                'Commercial Real Estate': "Real Estate",
                'Computer & Network Security': "IT",
                'Computer Games': "IT",
                'Computer Hardware': "IT",
                'Computer Networking': "IT",
                'Computer Software': "IT",
                'Construction': "Industrial",
                'Consumer Electronics': "Consumer Discretionary",
                'Consumer Goods': "Consumer Discretionary",
                'Consumer Services': "Consumer Discretionary",
                'Cosmetics': "Consumer Discretionary",
                'Dairy': "Consumer Staples",
                'Defense & Space': "Industrial",
                'Design': "Industrial",
                'E-Learning': "IT",
                'E-learning': "IT",
                'Education Management': "Other",
                'Educational Institution': "Other",
                'Electrical/Electronic Manufacturing': "Industrial",
                'Entertainment': "Consumer Discretionary",
                'Environmental Services': "Industrial",
                'Events Services': "Other",
                'Executive Office': "Industrial",
                'Facilities Services': "Industrial",
                'Farming': "Consumer Staples",
                'Financial Services': "Financials",
                'Fine Art': "Other",
                'Fishery': "Consumer Staples",
                'Food & Beverages': "Consumer Staples",
                'Food Production': "Consumer Staples",
                'Fund-Raising': "Financials",
                'Furniture': "Industrial",
                'Gambling & Casinos': "Consumer Discretionary",
                'Glass Ceramics & Concrete': "Industrial",
                'Government Administration': "Public",
                'Government Relations': "Public",
                'Graphic Design': "Comm Services",
                'Health Wellness and Fitness': "Health Care",
                'Higher Education': "Other",
                'Hospital & Health Care': "Health Care",
                'Hospitality': "Consumer Discretionary",
                'Human Resources': "Other",
                'Import and Export': "Financials",
                'Individual & Family Services': "Other",
                'Industrial Automation': "Industrial",
                'Information Services': "IT",
                'Information Technology and Services': "IT",
                'Insurance': "Financials",
                'International Affairs': "Other",
                'International Trade and Development': "Financials",
                'Internet': "IT",
                'Investment Banking': "Financials",
                'Investment Management': "Financials",
                'Judiciary': "Other",
                'Law Enforcement': "Public",
                'Law Practice': "Other",
                'Legal Services': "Other",
                'Leisure Travel & Tourism': "Consumer Discretionary",
                'Libraries': "Consumer Discretionary",
                'Logistics and Supply Chain': "Industrial",
                'Luxury Goods & Jewelry': "Consumer Discretionary",
                'Machinery': "Industrial",
                'Management Consulting': "Industrial",
                'Maritime': "Industrial",
                'Market Research': "Financials",
                'Marketing and Advertising': "Consumer Discretionary",
                'Mechanical Or Industrial Engineering': "Industrial",
                'Mechanical or Industrial Engineering': "Industrial",
                'Media Production': "Comm Services",
                'Medical Devices': "Health Care",
                'Medical Practice': "Health Care",
                'Mental Health Care': "Health Care",
                'Military': "Public",
                'Mining & Metals': "Industrial",
                'Motion Pictures and Film': "Consumer Discretionary",
                'Museums and Institutions': "Consumer Discretionary",
                'Music': "Consumer Discretionary",
                'Nanotechnology': "Industrial",
                'Newspapers': "Comm Services",
                'Nonprofit Organization Management': "Other",
                'Oil & Energy': "Industrial",
                'Online Media': "Comm Services",
                'Outsourcing/Offshoring': "Other",
                'Package/Freight Delivery': "Industrial",
                'Packaging and Containers': "Industrial",
                'Paper & Forest Products': "Industrial",
                'Performing Arts': "Consumer Discretionary",
                'Pharmaceuticals': "Health Care",
                'Philanthropy': "Other",
                'Photography': "Consumer Discretionary",
                'Plastics': "Industrial",
                'Primary/Secondary Education': "Other",
                'Printing': "Comm Services",
                'Privately Held': "Other",
                'Professional Training & Coaching': "Other",
                'Program Development': "IT",
                'Public Policy': "Public",
                'Public Relations and Communications': "Other",
                'Public Safety': "Public",
                'Publishing': "Comm Services",
                'Railroad Manufacture': "Industrial",
                'Ranching': "Consumer Staples",
                'Real Estate': "Real Estate",
                'Recreational Facilities and Services': "Consumer Discretionary",
                'Religious Institutions': "Other",
                'Renewables & Environment': "Industrial",
                'Research': "Financials",
                'Restaurants': "Consumer Discretionary",
                'Retail': "Consumer Discretionary",
                'Security and Investigations': "Other",
                'Semiconductors': "Industrial",
                'Shipbuilding': "Industrial",
                'Sporting Goods': "Consumer Discretionary",
                'Sports': "Consumer Discretionary",
                'Staffing and Recruiting': "Industrial",
                'Supermarkets': "Consumer Staples",
                'TBD': "Other",
                'Telecommunications': "Comm Services",
                'Textiles': "Industrial",
                'Tobacco': "Consumer Staples",
                'Translation and Localization': "Industrial",
                'Transportation/Trucking/Railroad': "Industrial",
                'Utilities': "Industrial",
                'Venture Capital & Private Equity': "Financials",
                'Veterinary': "Health Care",
                'Warehousing': "Industrial",
                'Wholesale': "Consumer Staples",
                'Wine and Spirits': "Consumer Staples",
                'Wireless': "Comm Services",
                'Writing and Editing': "Industrial"}

upper_management = ['president','président','owner','ceo','founder','director','principal','chief','CFO','CTO','COO','VP','Chairman','Board','Executive','CIO','Head','c.e.o.']
middle_management = ['Manager','Account','Coordinator','Lead','Head','Managing Member','Leader','development','advisor']
lower_management = ['Partner','Consultant','Engineer','Associate', 'Specialist', 'Recruiter','Senior','Developer','Instructor','Designer','writer','instructor','analyst','staff','team member','scrum']
upper_management = [x.lower() for x in upper_management]
middle_management = [x.lower() for x in middle_management]
lower_management = [x.lower() for x in lower_management]



In [538]:
### Function for processing the datasets
def Dataset_process(url,create_target_flag,Train_flag):
  df = pd.read_csv(url)
  # Map industry dictionary
  df = df.replace({'kw_industry':indu_dict })
  # Convert Dates to correct format
  if(Train_flag):
    df['campaign_bulk_date'] = pd.to_datetime(df['campaign_bulk_date']).dt.strftime("%d-%m-%Y")
  # Fill blanks in job title, lowercase and strip spaces
  df['title'] = df['title'].fillna('Not Defined')
  df['title'] = df['title'].str.lower()
  df['title'] = df['title'].str.strip()
  # Reduce title categories by decision making capabilities
  df.loc[df['title'].str.contains('|'.join(upper_management)),'title'] = "Upper Management"
  df.loc[df['title'].str.contains('|'.join(middle_management)),'title'] = "Middle Management"
  df.loc[df['title'].str.contains('|'.join(lower_management)),'title'] = "Lower Management"
  df.loc[~df['title'].str.contains("Upper Management|Middle Management|Lower Management|not defined"),'title'] = "Others"
  # As the target variable is not present in the dataset, we must create it ( only in the training dataset)
  if(create_target_flag):
    df = df.sort_values(by=['score','Response'], ascending= (True,False)).reset_index(drop=True)
    df['score'].fillna(0,inplace=True)
    # Initialize targetvariable 
    df['increased_value_flag'] = np.nan
    # For each idlead, i will see if the contact increased business value and assign the flag value
    unique_idlead = df['idlead'].unique()
    for id in unique_idlead:
      Index = df.index[df['idlead'] == id].tolist()
      max_score = 0.0
      for i in Index:
        if (df.iloc[i,7] > max_score):
          max_score = df.iloc[i,7]
          df.iloc[i,10] = 1
        else:
          df.iloc[i,10] = 0
  return(df)



In [539]:
Train = Dataset_process('https://raw.githubusercontent.com/Urobhi/BairesDev/master/DstCH/train.csv',False,True)
Train.to_csv('Train_clean.csv')
Test = Dataset_process('https://raw.githubusercontent.com/Urobhi/BairesDev/master/DstCH/test.csv',False,False)
Test.to_csv('Test_cleaned.csv')


In [540]:
# Load training and predicion datasets, doing some tiny data engineering
df_leads = pd.read_csv('https://raw.githubusercontent.com/Urobhi/BairesDev/master/DstCH/Train_clean.csv')
df_leads = df_leads.drop(['idlead','Response','city','campaign_bulk_date','score'],axis=1)
df_leads['state'] = df_leads['state'].fillna('Unkown')
df_leads['increased_value_flag'] = df_leads['increased_value_flag'].astype('category')
df_leads['state'].value_counts()
df_leads=pd.get_dummies(data=df_leads, columns = ['state','country','kw_industry','kw_size','title'])

df_leads_topredict = pd.read_csv('https://raw.githubusercontent.com/Urobhi/BairesDev/master/DstCH/Test_clean.csv')
df_leads_topredict['state'] = df_leads_topredict['state'].fillna('Unkown')




In [541]:
# Use independant variables for training
X= df_leads.drop('increased_value_flag',axis=1)
# Set target variable for training
Y= df_leads['increased_value_flag']

# Set variables used in the dataset that i need to predict
X_topredict = df_leads_topredict.drop(['idlead','city','Response'],axis=1)
X_topredict=pd.get_dummies(data= X_topredict, columns = ['state','country','kw_industry','kw_size','title'])

# align training and prediction dataset to avoid errors in one hot encoding
X, X_topredict = X.align(X_topredict, join='inner', axis=1)

# Divide training set into Train and validation
X_train,X_val,Y_train,Y_val = train_test_split(X,Y, test_size=0.3,random_state=2,stratify=Y)

In [542]:
 # Initialize and fit model (using default parameters due tu lack of time and resources)
clf = RandomForestClassifier()
clf.fit(X_train,Y_train)

# Train
pred_train = clf.predict(X_train)
# Should have only two unique outputs
print(np.unique(pred_train))
#Predict validation dataset
pred_val = clf.predict(X_val)
# Calculate recall in training and testing
train_recall = recall_score(pred_train,Y_train)
val_recall = recall_score(pred_val,Y_val)
print(train_recall)
print(val_recall)

[0 1]
0.9367088607594937
0.7340425531914894


In [544]:
df_leads_topredict.to_csv('Results.csv')