In [2]:
#Data Manipulation Libraries
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np

#ML Libraries
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss

#Booster
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

#Visualization Libraries
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz as gp
from io import StringIO
from IPython.display import Image
import pydotplus

#Evaluation Libraries
from sklearn.model_selection import cross_val_score


In [3]:
#reads in file with prospect & training company information
c = pd.read_csv("") 

In [4]:
#Data Preprocessing
#Converts State names to acronyms
state_mapping = {'Alabama': 'AL','Alaska': 'AK','Arizona': 'AZ','Arkansas': 'AR','California': 'CA','Colorado': 'CO','Connecticut': 'CT','Delaware': 'DE','Florida': 'FL','Georgia': 'GA','Hawaii': 'HI','Idaho': 'ID','Illinois': 'IL','Indiana': 'IN','Iowa': 'IA','Kansas': 'KS','Kentucky': 'KY','Louisiana': 'LA','Maine': 'ME','Maryland': 'MD','Massachusetts': 'MA','Michigan': 'MI','Minnesota': 'MN','Mississippi': 'MS','Missouri': 'MO','Montana': 'MT','Nebraska': 'NE','Nevada': 'NV','New Hampshire': 'NH','New Jersey': 'NJ','New Mexico': 'NM','New York': 'NY','North Carolina': 'NC','North Dakota': 'ND','Ohio': 'OH','Oklahoma': 'OK','Oregon': 'OR','Pennsylvania': 'PA','Rhode Island': 'RI','South Carolina': 'SC','South Dakota': 'SD','Tennessee': 'TN','Texas': 'TX','Utah': 'UT','Vermont': 'VT','Virginia': 'VA','Washington': 'WA','West Virginia': 'WV','Wisconsin': 'WI','Wyoming': 'WY'}
c = c.replace({'State':state_mapping})

In [5]:
#Strips Employee count to only numbers
count = 0
for x in c.Employees:
    x = x.split()
    y = x[0]
    y = int(y)
    c.Employees[count] = y
    count = count + 1

In [6]:
#adds dummy variables for State
c = pd.get_dummies(c, columns=['State'])

In [7]:
#adds dummy variables for Industry
c = pd.get_dummies(c, columns=['Industry'])

In [8]:
#Sets name as Index
c = c.set_index('Name')

In [9]:
#changes years to age
count = 0
#current year
cy = 2020
for x in c.Year:
    x = int(x)
    x = cy - x
    if(x == 2020):
        x = 0
    c.Year[count] = x
    count = count + 1

In [10]:
#prints entire array to check
pd.set_option("display.max_rows", None, "display.max_columns", None)

print(c)

                                                  Geo_ID  Year Employees  \
Name                                                                       
3 Birds Marketing LLC                                  0    11        55   
9Gauge Partners, LLC                                   0     9        87   
Aceyus                                                 0    18        50   
Aesthetic Management Partners                          1     2        34   
Akos MD                                                0     4        32   
Altvia Solutions                                       0    14        33   
American Aerospace Technologies, Inc.                  0    18        13   
AmplifAI Solutions, Inc.                               0     6        27   
Arx Nimbus - Data-Driven Digital Risk Management       0     5        12   
Attribytes                                             0     5        27   
Avertra Corp                                           0    13       103   
Banyan Hills

In [11]:
#assign feature vars
#f_cols = ['Number_of_Employees_11-50 employees', 'Number_of_Employees_2-10 employees', 'Number_of_Employees_501-1,000 employees', 'Number_of_Employees_51-200 employees', 'State_AZ', 'State_CA', 'State_CO', 'State_FL', 'State_GA', 'State_ID', 'State_IL', 'State_KS', 'State_MD', 'State_MN', 'State_MO', 'State_MT', 'State_NC', 'State_NJ', 'State_NV', 'State_NY', 'State_OH', 'State_OR', 'State_PA', 'State_TN', 'State_TX', 'State_UT', 'State_VA', 'State_WA']
states = c.filter(like='State')
inds = c.filter(like='Industry')
numEmployees = c.Employees
yrs = c.Year
geo_ID = c.Geo_ID

f_cols = pd.concat([numEmployees, states, yrs, inds, geo_ID], axis=1)
X = f_cols
#assign target vars
Y = c.Qualification

In [12]:
#data split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.3, random_state=1)

In [13]:
#runs regression
regr_1 = AdaBoostRegressor(DecisionTreeRegressor(criterion='mse', max_depth = 2), n_estimators=300)
regr_2 = GradientBoostingClassifier(criterion='mse', n_estimators=300, learning_rate=.1, max_depth=2)
#regr_3 = AdaBoostRegressor(DecisionTreeRegressor(criterion='mse'), n_estimators=300)
#regr_4 = GradientBoostingRegressor(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
regr_1.fit(x_train, y_train)
regr_2.fit(x_train, y_train)
#regr_3.fit(x_train, y_train)
#regr_4.fit(x_train, y_train)

GradientBoostingClassifier(criterion='mse', init=None, learning_rate=0.1,
                           loss='deviance', max_depth=2, max_features=None,
                           max_leaf_nodes=None, min_impurity_decrease=0.0,
                           min_impurity_split=None, min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=300, n_iter_no_change=None,
                           presort='auto', random_state=None, subsample=1.0,
                           tol=0.0001, validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [15]:
#predicts variables and stores in y_pred
y_pred = regr_1.predict(x_test)
y_pred = regr_2.predict(x_test)
#y_pred = regr_3.predict(x_test)
#y_pred = regr_4.predict(x_test)

#Displays result validity in confusion matrix
#results = pd.DataFrame({'Actual':y_test, 'Predicted':y_pred})
#Confusion Matrix 
print(confusion_matrix(y_test,y_pred))
tn, fp, fn, tp = confusion_matrix(y_test,y_pred).ravel()
#lists out values
print(tn, fp, fn, tp)
#computes ratio 
print((fp+fn)/(tn+tp))

In [16]:
score = regr_1.score(x_test, y_test)
print(score)
score = regr_2.score(x_test, y_test)
print(score)
#score = regr_3.score(x_test, y_test)
#print(score)
#score = regr_4.score(x_test, y_test)
#print(score)

0.27565012663588306
0.7142857142857143


In [20]:
#Input with Prediction Mechanism
print('Enter number of employees, state acronym, founding year, & industry. Enter 0 for anything not known:')
mylist = []
for i in range(4):
    i = input()
    mylist.append(i)

e = mylist[0]
s = mylist[1]
y = mylist[2]
i = mylist[3]

df = np.array([0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])

#processes and enters employees in dataframe
e = int(e)
df[1] = e

#processes and establishes year
y = int(y)
y = 2020-y
df[2] = y

#Establishes State Variable
#State_AZ
if(s == 'AZ'):
    df[3] = 1
    df[0]=0
#State_CA
elif(s == 'CA'):
    df[4]=1
    df[0]=1
#State_CO
elif(s == 'CO'):
    df[5]=1
    df[0]=0
#State_FL
elif(s == 'FL'):
    df[6]=1
    df[0]=1
#State_GA
elif(s == 'GA'):
    df[7]=1
    df[0]=0
#State_ID
elif(s == 'ID'):
    df[8]=1
    df[0]=0
#State_IL
elif(s == 'IL'):
    df[9]=1
    df[0]=0
#State_KS
elif(s == 'KS'):
    df[10]=1
    df[0]=0
#State_MD
elif(s == 'MD'):
    df[11]=1
    df[0]=0
#State_MN
elif(s == 'MN'):
    df[12]=1
    df[0]=0
#State_MO
elif(s == 'MO'):
    df[13]=1
    df[0]=1
#State_MT
elif(s == 'MT'):
    df[14]=1
    df[0]=0
#State_NC
elif(s == 'NC'):
    df[15]=1
    df[0]=0
#State_NJ
elif(s == 'NJ'):
    df[16]=1
    df[0]=0
#State_NV
elif(s == 'NV'):
    df[17]=1
    df[0]=0
#State_NY
elif(s == 'NY'):
    df[18]=1
    df[0]=1
#State_OH
elif(s == 'OH'):
    df[19]=1
    df[0]=0
#State_OR
elif(s == 'OR'):
    df[20]=1
    df[0]=0
#State_PA
elif(s == 'PA'):
    df[21]=1
    df[0]=0
#State_TN
elif(s == 'TN'):
    df[22]=1
    df[0]=0
#State_TX
elif(s == 'TX'):
    df[23]=1
    df[0]=0
#State_UT
elif(s == 'UT'):
    df[24]=1
    df[0]=0
#State_VA
elif(s == 'VA'):
    df[25]=1
    df[0]=0
#State_WA
elif(s == 'WA'):
    df[26]=1
    df[0]=0
else:
    s = 0

#Establishes Industry Variable  
if(i == 'Aviation & Aerospace'):
    df[27]=1
elif(i == 'Biotechnology'):
    df[28]=1
elif(i == 'Computer & Network Security'):
    df[29]=1
elif(i == 'Computer Software'):
    df[30]=1
elif(i == 'Construction'):
    df[31]=1
elif(i == 'Consumer Electronics'):
    df[32]=1
elif(i == 'Consumer Goods'):
    df[33]=1
elif(i == 'Consumer Services'):
    df[34]=1
elif(i == 'Design'):
    df[35]=1
elif(i == 'E-learning'):
    df[36]=1
elif(i == 'Education Management'):
    df[37]=1
elif(i == 'Financial Services'):
    df[38]=1
elif(i == 'Health, Wellness & Fitness'):
    df[39]=1
elif(i == 'Higher Education'):
    df[40]=1
elif(i == 'Hospital & Health Care'):
    df[41]=1
elif(i == 'Hospitality'):
    df[42]=1
elif(i == 'Information Technology & Services'):
    df[43]=1
elif(i == 'Insurance'):
    df[44]=1
elif(i == 'Internet'):
    df[45]=1
elif(i == 'Logistics & Supply Chain'):
    df[46]=1
elif(i == 'Management Consulting'):
    df[47]=1
elif(i == 'Marketing & Advertising'):
    df[48]=1
elif(i == 'Medical Device'):
    df[49]=1
elif(i == 'Non-profit Organization Management'):
    df[50]=1
elif(i == 'Oil & Energy'):
    df[51]=1
elif(i == 'Online Media'):
    df[52]=1
elif(i == 'Package/Freight Delivery'):
    df[53]=1
elif(i == 'Real Estate'):
    df[54]=1
elif(i == 'Renewables & Environment'):
    df[55]=1
elif(i == 'Retail'):
    df[56]=1
elif(i == 'Semiconductors'):
    df[57]=1
elif(i == 'Sports'):
    df[58]=1
elif(i == 'Staffing & Recruiting'):
    df[59]=1
elif(i == 'Transportation/Trucking/Railroad'):
    df[60]=1
else:
    i = 0

print(df)

df = df.reshape(-1,61)
X = df
print(regr_2.predict(X))
#print(regr_2.predict(X))
#print(regr_3.predict(X))
#print(regr_4.predict(X))
#print(r.score(X))

Enter number of employees, state acronym, founding year, & industry. Enter 0 for anything not known:
52 CA 1997 Transportation/Trucking/Railroad





ValueError: invalid literal for int() with base 10: '52 CA 1997 Transportation/Trucking/Railroad'

In [18]:
features = c.columns
features = features.drop('Qualification')

In [None]:
#Visualizes Employee Count versus Qualification
c.Employees = c.Employees.astype(int)
ax1 = c.plot(x='Employees',y='Qualification',kind='scatter')
#Visualizes Years versus Qualification
ax2 = c.plot(x='Year',y='Qualification',kind='scatter')
ax2.set_xlim(1970,2020)
ax1
ax2