# Importing Libraries

In [108]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_selection import chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [109]:
columns = [0,1] 
events = pd.read_excel(r'D:\Event.xlsx',usecols= columns)
target = events.Type.unique().tolist()

# Data pre-processing..!!

In [110]:
events['category_id'] = events['Type'].factorize()[0]

In [111]:
category_df = events[['Type', 'category_id']].drop_duplicates().sort_values('category_id')
category_2_id = dict(category_df.values)
id_2_category = dict(category_df[['category_id', 'Type']].values)

In [112]:
stops_words = text.ENGLISH_STOP_WORDS.difference(["AI", "ai"])
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=2, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words= stops_words, token_pattern = r"(?u)c\+{2}|\b\w+\b")
features = tfidf.fit_transform(events.Events.values)
labels = events.Type
features.shape

(1082, 1217)

In [113]:
N = 2
for Product, category_id in sorted(category_2_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Product))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'AI.Certifications':
  . Most correlated unigrams:
. dynamics
. year
  . Most correlated bigrams:
. e certificate
. embedded systems
# 'AI.Courses':
  . Most correlated unigrams:
. dynamics
. year
  . Most correlated bigrams:
. e certificate
. embedded systems
# 'AI.Expos':
  . Most correlated unigrams:
. dynamics
. year
  . Most correlated bigrams:
. e certificate
. embedded systems
# 'AI.Fests':
  . Most correlated unigrams:
. dynamics
. year
  . Most correlated bigrams:
. e certificate
. embedded systems
# 'AI.Hackathons':
  . Most correlated unigrams:
. dynamics
. year
  . Most correlated bigrams:
. e certificate
. embedded systems
# 'AI.Internships':
  . Most correlated unigrams:
. dynamics
. year
  . Most correlated bigrams:
. e certificate
. embedded systems
# 'AI.Jobs':
  . Most correlated unigrams:
. dynamics
. year
  . Most correlated bigrams:
. e certificate
. embedded systems
# 'AI.Seminars':
  . Most correlated unigrams:
. dynamics
. year
  . Most correlated bigrams:
. e

# Data modelling

In [114]:
model = RandomForestClassifier()
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(features, labels, events.index, test_size=0.01, random_state=0)

In [115]:
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [116]:
model.score(X_train , y_train)

0.9971988795518207

**We got 99.7% accuracy which is very good**

In [117]:
print("Confusion matrix is\n",confusion_matrix(y_test,y_pred))
print("Classification report is\n",classification_report(y_test,y_pred))

Confusion matrix is
 [[0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]]
Classification report is
                                    precision    recall  f1-score   support

           Blockchain.Internships       0.00      0.00      0.00         1
                  Blockchain.Jobs       0.00      0.00      0.00         0
              Blockchain.Seminars       1.00      1.00      1.00         2
                      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [118]:
model = LinearSVC()
model.fit(features, labels)

LinearSVC()

In [119]:

N = 2
for Product, category_id in sorted(category_2_id.items()):
    indices = np.argsort(model.coef_[category_id])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 1][:N]
    bigrams = [v for v in reversed(feature_names) if len(v.split(' ')) == 2][:N]
    print("# '{}':".format(Product))
    print("  . Top unigrams:\n       . {}".format('\n       . '.join(unigrams)))
    print("  . Top bigrams:\n       . {}".format('\n       . '.join(bigrams)))


# 'AI.Certifications':
  . Top unigrams:
       . project
       . c
  . Top bigrams:
       . job opening
       . time job
# 'AI.Courses':
  . Top unigrams:
       . uses
       . c
  . Top bigrams:
       . conferences seminars
       . programming language
# 'AI.Expos':
  . Top unigrams:
       . c
       . live
  . Top bigrams:
       . participate c
       . hackathons codeathons
# 'AI.Fests':
  . Top unigrams:
       . expo
       . fair
  . Top bigrams:
       . c expo
       . c attend
# 'AI.Hackathons':
  . Top unigrams:
       . fellow
       . event
  . Top bigrams:
       . compete fellow
       . think know
# 'AI.Internships':
  . Top unigrams:
       . bootcamp
       . c
  . Top bigrams:
       . c training
       . join free
# 'AI.Jobs':
  . Top unigrams:
       . popular
       . c
  . Top bigrams:
       . talk series
       . talks speeches
# 'AI.Seminars':
  . Top unigrams:
       . fest
       . dedicated
  . Top bigrams:
       . fest coming
       . annual c
# '

In [120]:
def get_query(event_dom, event_type, employees):
    return employees.query("Domain == '" + event_dom + "' and (Event1 == '" + event_type +"' or Event2 == '" + event_type + "')")

In [121]:
def predict(events, employees):
        recommendations = []
        pred = model.predict(tfidf.transform(events))  
        for text, predicted in zip(events, pred):
            print('"{}"'.format(text))
            print("  - Predicted as: '{}'".format(predicted))
            print("")
        for prediction in pred.tolist():
            domain, event_type = prediction.split(".")
            if domain == 'AI':
                recommend_to = get_query('Artificial Intelligence', event_type, employees)
                
            elif domain == 'WebDev':
                recommend_to = get_query('Web Development', event_type, employees)
                
            elif domain == 'Mobile_app':
                recommend_to = get_query('Mobile Applications', event_type, employees)
                
            elif domain == 'ML':
                recommend_to = get_query('Machine Learning', event_type, employees)
                
            elif domain == 'CC':
                recommend_to = get_query('Cloud Computing', event_type, employees)
                
            elif domain == 'Higher_Edu':
                recommend_to = get_query('Higher Education', event_type, employees)
                
            elif domain == 'DevOps':
                recommend_to = get_query('Development Processes', event_type, employees)
                
            elif domain == 'Software_Architecture':
                recommend_to = get_query('Software Architecture', event_type, employees)
                
            elif domain == 'Data_Science':
                recommend_to = get_query('Data Science', event_type, employees)
                
            elif domain == 'Cpp':
                recommend_to = get_query('C++', event_type, employees)
                
            elif domain == 'None':
                recommend_to = employees.query("Event1 == '" + event_type + "' or Event2 == '" + event_type + "'")
                
            else:
                recommend_to = get_query(domain, event_type, employees)
                
            recommendations.append(", ".join(recommend_to['Name'].values))
            
        return recommendations


In [122]:
def make_predictions():
    employees = pd.read_csv(r'D:\CCMLEmployeeData.csv')
    to_pred_events = pd.read_csv(r'D:\events.csv', encoding= 'unicode_escape')
    recommendations = predict(to_pred_events.Events, employees)
    to_pred_events['Employees'] = recommendations
    to_pred_events.to_excel(r'D:\outys.xlsx', index=False)

In [123]:
make_predictions()

" "Get a System Administration certification from PurpleHat today.""
  - Predicted as: 'Finance.Certifications'

" "Lockdown special courses on Ydemi. 22 hours left!""
  - Predicted as: 'C.Courses'

" "CodeBoost codeathon is live now!""
  - Predicted as: 'C.Hackathons'

"" How To Ace Job Interviews" Seminar by Cloud Councleage"
  - Predicted as: 'Python.Jobs'

"Live Webinar: Using AI as Prediction Machines"
  - Predicted as: 'Coding.Webinars'

"Analytics Vidhya invites all AI aspirants, data professionals, data scientists, business analytics professionals, and academicians for an exciting webinar on ?Using AI as Prediction Machines? "
  - Predicted as: 'AI.Webinars'

"Free Virtual Session on IOT (26th July20- 10th July 28)"
  - Predicted as: 'IoT.Hackathons'

"Machine Learning with Python & Cloud Deployment"
  - Predicted as: 'ML.Trainings'

"Get free certification with training in trending technical domains"
  - Predicted as: 'Cpp.Trainings'

"CodeChef's November Challenge 2020"
  - P