In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from io import StringIO

# Read the labelled training set and keep only the label and text columns.
col = ['Category', 'Job_Title']
df = pd.read_csv('Train.csv')[col]
df.columns = ['Category', 'Job_Title']

# Encode every distinct category as an integer id.
df['category_id'] = pd.factorize(df['Category'])[0]

# One row per (Category, category_id) pair, ordered by id; the two dicts
# below are lookup tables in each direction.
category_id_df = (
    df[['Category', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['category_id', 'Category']].values)


# Bar chart of how many job titles fall under each category.
fig = plt.figure(figsize=(8, 6))
titles_per_category = df.groupby('Category')['Job_Title'].count()
titles_per_category.plot.bar(ylim=0)
plt.show()


# Turn every job title into a TF-IDF vector over unigrams and bigrams,
# ignoring English stop words and terms seen in fewer than 5 titles.
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 2),
    stop_words='english'
)

# Dense (n_titles, n_terms) matrix plus the integer label of each title.
features = tfidf.fit_transform(df['Job_Title']).toarray()
labels = df['category_id']
# Bare expression: only displayed when it is the last statement of a cell.
features.shape



# For every category, print the N unigrams and N bigrams whose TF-IDF features
# score highest on a chi-squared test against "title belongs to this category".
N = 2

# The vocabulary is the same for every category, so build the name array once
# instead of once per loop iteration. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); use whichever this
# installation provides.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.array(tfidf.get_feature_names())

for Category, category_id in sorted(category_to_id.items()):
    # chi2 returns (scores, p-values); only the scores are used.
    chi2_scores = chi2(features, labels == category_id)[0]
    # argsort is ascending, so the most correlated terms end up last.
    feature_names = all_feature_names[np.argsort(chi2_scores)]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Category))
    print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))




# Split titles and category names into train and test parts (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df['Job_Title'],
    df['Category'],
    random_state=0
)

# Bag-of-words counts for the training titles ...
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# ... re-weighted to TF-IDF.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Multinomial Naive Bayes trained on the TF-IDF weighted training titles.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# Ask for a company and a single job title, then classify the company's jobs
# (read from the "Jobs" column of '<company>.csv') and the typed-in title with
# the trained classifier.

# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    _read_line = raw_input
except NameError:
    _read_line = input


def _predict_categories(titles):
    """Return the predicted category label for each title in *titles*.

    The classifier was fitted on TF-IDF weighted counts, so new text has to go
    through the same count -> TF-IDF steps before it is passed to predict().
    """
    counts = count_vect.transform(titles)
    return clf.predict(tfidf_transformer.transform(counts))


company = str(_read_line("Enter Company Name:"))
job_title = str(_read_line("Enter Job Title:"))

df_company = pd.read_csv('{}.csv'.format(company))

company_jobs = df_company["Jobs"].tolist()

# Classify all jobs in one call and store plain labels (not 1-element arrays)
# so the column stays hashable for operations such as value_counts().
# With no jobs nothing is predicted and no column is added.
Prediction = []
if company_jobs:
    Prediction = _predict_categories(company_jobs).tolist()
    df_company["Prediction"] = Prediction

print(_predict_categories([job_title]))

def model_evaluation(cv=5):
    """Compare several classifiers by mean cross-validated accuracy.

    Each candidate model is scored with ``cv``-fold cross-validation on the
    module-level ``features`` and ``labels``.

    Args:
        cv: Number of cross-validation folds. Defaults to 5.

    Returns:
        A pandas Series indexed by model class name whose values are the
        accuracy averaged over the folds.
    """
    models = [
        RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
        LinearSVC(),
        MultinomialNB(),
        LogisticRegression(random_state=0),
    ]

    # One (model_name, fold_idx, accuracy) row per model and fold.
    entries = []
    for model in models:
        model_name = model.__class__.__name__
        accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=cv)
        entries.extend(
            (model_name, fold_idx, accuracy)
            for fold_idx, accuracy in enumerate(accuracies)
        )
    cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

    return cv_df.groupby('model_name').accuracy.mean()

# Mean cross-validated accuracy for each candidate model.
accuracy_check = model_evaluation()

# The parenthesised single-argument form prints the same on Python 2 and 3;
# the bare print statement is a syntax error on Python 3.
print(accuracy_check)

  from numpy.core.umath_tests import inner1d


<Figure size 800x600 with 1 Axes>

# 'Business Operations':
  . Most correlated unigrams:
       . business
       . operations
  . Most correlated bigrams:
       . operations analyst
       . business operations
# 'Customer Service':
  . Most correlated unigrams:
       . service
       . customer
  . Most correlated bigrams:
       . service representative
       . customer service
# 'Engineering, Research and Development':
  . Most correlated unigrams:
       . engineer
       . research
  . Most correlated bigrams:
       . research associate
       . research development
# 'Leadership':
  . Most correlated unigrams:
       . chief
       . director
  . Most correlated bigrams:
       . director ofoperations
       . vice president
# 'Other':
  . Most correlated unigrams:
       . human
       . attorney
  . Most correlated bigrams:
       . associate attorney
       . human resources
# 'Sales and Marketing':
  . Most correlated unigrams:
       . sales
       . marketing
  . Most correlated bigrams:
       . marke