In [None]:
# All the dependency imports

# Data Analysis
import numpy as np
import pandas as pd

#data visualisation
import seaborn as sns
import matplotlib.pyplot as plt

# Plot styling
sns.set_style('dark')

# Silence every warning for the rest of the session
import warnings
warnings.filterwarnings('ignore')


# Label encoding: three independent encoders (le: classification, te: title,
# ce: company). They must be distinct objects -- `le = te = ce = LabelEncoder()`
# would bind a single encoder to all three names.
from sklearn.preprocessing import LabelEncoder
le, te, ce = LabelEncoder(), LabelEncoder(), LabelEncoder()


from sklearn.metrics import classification_report

# Model Prediction

In [None]:
# Machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [None]:
# Seed NumPy's global RNG
np.random.seed(3)
# Load the jobs data

jobs = pd.read_csv("./data/jobs.csv")
# Rename the columns positionally; the assignment requires the file to have
# exactly four columns.
jobs.columns =['id', 'title', 'classification','company']
jobs.head()


In [None]:
# Distinct values of the classification column
jobs.classification.unique()

In [None]:
# Encode each text column with its own encoder (le: classification, te: title,
# ce: company) so the label <-> code mapping of every column stays available
# afterwards. Refitting a single encoder for all three columns would leave it
# holding only the classes of the last column it saw.
# NOTE: the text columns of `jobs` are overwritten in place. Reload the CSV
# before re-running this cell, otherwise the encoders are refitted on the
# already-encoded integers.
jobs['classification'] = le.fit_transform(jobs['classification'])
jobs['title'] = te.fit_transform(jobs['title'])
jobs['company'] = ce.fit_transform(jobs['company'])


In [None]:
# Number of rows in the dataset
len(jobs)


In [None]:
# Missing-value count per column.
# NOTE(review): this runs after the label-encoding cell, so title,
# classification and company already hold integer codes here.
jobs.isna().sum()

In [None]:
# Column names as a plain array
print(jobs.columns.values)

In [None]:
# (rows, columns)
print(jobs.shape)

In [None]:
# Column dtypes and non-null counts
jobs.info()

In [None]:
# No need to cast each column to `category` as below; the columns have already been label-encoded with LabelEncoder.
# jobs['title']=jobs['title'].astype('category')
# jobs['classification']=jobs['classification'].astype('category')
# jobs['company']=jobs['company'].astype('category')

# jobs.info()

In [None]:
# Summary statistics. title, classification and company hold integer label
# codes at this point, so the figures describe the codes, not the labels.
jobs.describe()

# Unique value counts

In [None]:
# Fraction of rows falling into each (encoded) classification
jobs['classification'].value_counts(normalize=True)

# Data Visualization

In [None]:
# Number of rows per company; the x axis shows the encoded company codes
sns.countplot(x='company',data=jobs)

In [None]:
# Target column as a Series. Despite the name, it holds the integer codes
# written by the label-encoding cell, not the original label text.
label_names = jobs['classification']
label_names

In [None]:
# Same target column, selected as a one-column DataFrame
jobs[["classification"]]

In [None]:
# Re-seed NumPy's global RNG before the split
np.random.seed(3)

# Features: encoded title and company. Target: encoded classification, kept as
# a one-column DataFrame because later cells read `y_test.classification`.
X = jobs.loc[:, ['title', 'company']]
y = jobs.loc[:, ['classification']]

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
# Candidate classifiers keyed by display name, all with default settings
models = dict([
    ("Logistic Regression", LogisticRegression()),
    ("Random Forest", RandomForestClassifier()),
    ("KNN", KNeighborsClassifier()),
    ("SGD", SGDClassifier()),
    ("DT", DecisionTreeClassifier()),
])

# Create a function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test, seed=3):
    """
    Fit each model on the training split and score it on the test split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. The estimators are fitted in place.
    X_train : training data
    X_test : testing data
    y_train : labels associated with the training data
    y_test : labels associated with the test data
        For both label arguments a single-column 2-D input (for example
        ``df[["label"]]``) is flattened to 1-D before use; any other shape is
        passed through untouched.
    seed : int, default 3
        Value handed to ``np.random.seed`` before fitting, so repeated calls
        start from the same global RNG state.

    Returns
    -------
    dict
        ``{name: model.score(X_test, y_test)}`` for every entry of ``models``.
    """
    def _as_1d(labels):
        # Only a (n, 1) column vector is flattened; genuine multi-output
        # targets and 1-D inputs are returned unchanged.
        values = np.asarray(labels)
        if values.ndim == 2 and values.shape[1] == 1:
            return values.ravel()
        return labels

    # Seed NumPy's global RNG so repeated runs give the same result
    np.random.seed(seed)

    # Estimators expect 1-D class labels; a column vector only works through an
    # implicit conversion that emits a warning.
    train_labels = _as_1d(y_train)
    test_labels = _as_1d(y_test)

    # Dict of scores keyed by model name
    model_scores = {}

    for name, model in models.items():
        model.fit(X_train, train_labels)
        model_scores[name] = model.score(X_test, test_labels)

    return model_scores
        


In [None]:
# Fit every candidate on the training split and collect its test-set score
model_scores = fit_and_score(models, X_train, X_test, y_train, y_test)
model_scores

In [None]:
# Standalone decision tree, reused by later cells for the classification report
# and for predictions. random_state makes the fitted tree independent of
# whatever state NumPy's global RNG was left in by earlier cells, so re-running
# this cell on its own rebuilds the same tree.
DT = DecisionTreeClassifier(random_state=3)
DT.fit(X_train, y_train)
DT.score(X_test, y_test)

# Model Comparison

In [None]:
# One bar per model, bar height = the score stored in model_scores
model_compare = pd.DataFrame(model_scores, index=['accuracy'])
model_compare.transpose().plot(kind='bar')

In [None]:
# Held-out feature rows (encoded title and company)
X_test


In [None]:
# Predicted classification codes for the held-out rows, from the standalone tree
y_predict = DT.predict(X_test)
y_predict

In [None]:

# Text report comparing the held-out labels with the tree's predictions;
# classes appear as their integer codes
print(classification_report(y_test, y_predict))

In [None]:
# Reload the raw CSV (the text columns of `jobs` were replaced by integer codes)
# and fit one encoder per column on the original values, so that each encoder's
# classes_ can be used to decode codes.
jobs_predict = pd.read_csv("./data/jobs.csv")
jobs_predict.columns =['id', 'title', 'classification','company']
le.fit(jobs_predict['classification'])
te.fit(jobs_predict['title'])
ce.fit(jobs_predict['company'])

In [None]:
# Classes learned by the company encoder
ce.classes_

In [None]:
# Classes learned by the classification encoder
le.classes_

In [None]:
# Classes learned by the title encoder
te.classes_

In [None]:
# Position in classes_ -> class value, one lookup per encoder
title_map = dict(enumerate(te.classes_))
classification_map = dict(enumerate(le.classes_))
company_map = dict(enumerate(ce.classes_))

In [None]:
# Pair every held-out label (encoded) with the tree's prediction for that row
output = pd.DataFrame({
    "y_test": y_test["classification"].tolist(),
    "y_pred": list(y_predict),
})

In [None]:
# Translate both code columns through classification_map
y_test_value, y_pred_value = (
    list(output[column].map(classification_map))
    for column in ("y_test", "y_pred")
)

In [None]:
# Translate the held-out feature codes through title_map / company_map
X_title_value = list(X_test["title"].map(title_map))
X_company_value = list(X_test["company"].map(company_map))


In [None]:
# Side-by-side table: decoded inputs, actual label, predicted label
new_output = pd.DataFrame(
    dict(
        Title=X_title_value,
        Company=X_company_value,
        Actual=y_test_value,
        Predicted=y_pred_value,
    )
)
new_output

In [None]:
# new_output.to_csv('newoutput.csv')

# Validating the model with a new set of inputs

In [None]:
# Separate encoders for the validation file. Their label -> code assignment is
# independent of le / te / ce.
test_company_encoder, test_title_encoder = LabelEncoder(), LabelEncoder()

In [None]:
# Load the validation file and preview its first rows
jobs_test = pd.read_csv("./data/jobs_test.csv")
jobs_test.head()

In [None]:
# Set the supplied jobClassification column aside (as a one-column DataFrame),
# then remove it from the frame that will be fed to the model.
job_classification = jobs_test.loc[:, ['jobClassification']]
jobs_test = jobs_test.drop(columns=['jobClassification'])

In [None]:
# Rename the remaining columns positionally; the assignment requires exactly
# four columns to be left after jobClassification was dropped.
jobs_test.columns =[ 'id','jobid','title','company']
# Fit the validation-only encoders on the validation file's own values
test_title_encoder.fit(jobs_test['title'])
test_company_encoder.fit(jobs_test['company'])

In [None]:
# Encode the validation rows with the SAME label -> code tables the model was
# trained on (te for title, ce for company, both fitted on jobs.csv). Encoders
# fitted on the validation file would number its labels independently, so the
# same integer would stand for a different title/company than during training
# and the predictions would be meaningless.
title_codes = {label: code for code, label in enumerate(te.classes_)}
company_codes = {label: code for code, label in enumerate(ce.classes_)}

# Labels that never occurred in the training data have no code; they get this
# sentinel instead of raising.
# NOTE(review): the tree has never seen -1, so predictions for such rows are
# best-effort only.
UNSEEN_CODE = -1

jobs_test_numeric = pd.DataFrame({
    'title': jobs_test['title'].map(title_codes).fillna(UNSEEN_CODE).astype(int),
    'company': jobs_test['company'].map(company_codes).fillna(UNSEEN_CODE).astype(int),
})
jobs_test_numeric.head()

In [None]:
# Predicted classification code for every validation row
result = DT.predict(jobs_test_numeric)
result

In [None]:
# res_df = pd.DataFrame({"label_names":job_classification})
# res_df
# job_classification

In [None]:
# Final table: identifiers and raw inputs of every validation row, plus the
# model's prediction decoded through classification_map. The predictions are
# aligned on the index of jobs_test, the frame they were computed from.
final = pd.DataFrame({
    "_id": jobs_test.id,
    "JobId": jobs_test.jobid,
    "jobTitle": jobs_test.title,
    "jobCompany": jobs_test.company,
    "predictedClassification": pd.Series(result, index=jobs_test.index).map(classification_map),
})
final