In [2]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

from matplotlib.ticker import MaxNLocator
import matplotlib.gridspec as gridspec
import matplotlib.patches as mpatches

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.metrics import f1_score, accuracy_score


# Keras
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.models import Sequential
# from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
# from keras import utils
# from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import nltk
nltk.download('omw-1.4')

# Word2vec
# import gensim
# from gensim.test.utils import common_texts
# from gensim.models import Word2Vec


# Utility
import string
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools
import random
import datetime

# # WordCloud
# from PIL import Image
# from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# from collections import Counter, defaultdict

# Warnings
import warnings 
warnings.filterwarnings('ignore')

# Set log
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xizhima/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /Users/xizhima/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/xizhima/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /Users/xizhima/nltk_data...
[nltk_data] Downloading package omw-1.4 to /Users/xizhima/nltk_data...


In [3]:
import pandas as pd

### **Step - 1.2 :** *Importing Dataset*

In [4]:
df = pd.read_csv("linkedin-jobs-usa.csv")
df1 = df

In [4]:
df.head(5)

Unnamed: 0,title,company,description,onsite_remote,salary,location,criteria,posted_date,link
0,Data Analyst - Recent Graduate,PayPal,"At PayPal (NASDAQ: PYPL), we believe that ever...",onsite,,Buffalo-Niagara Falls Area,"[{'Seniority level': 'Not Applicable'}, {'Empl...",2022-11-22,https://www.linkedin.com/jobs/view/data-analys...
1,Data Analyst - Recent Graduate,PayPal,"At PayPal (NASDAQ: PYPL), we believe that ever...",onsite,,"San Jose, CA","[{'Seniority level': 'Not Applicable'}, {'Empl...",2022-11-22,https://www.linkedin.com/jobs/view/data-analys...
2,Data Analyst,PayPal,"At PayPal (NASDAQ: PYPL), we believe that ever...",onsite,,"Texas, United States","[{'Seniority level': 'Not Applicable'}, {'Empl...",2022-11-17,https://www.linkedin.com/jobs/view/data-analys...
3,Data Analyst,PayPal,"At PayPal (NASDAQ: PYPL), we believe that ever...",onsite,,"Illinois, United States","[{'Seniority level': 'Not Applicable'}, {'Empl...",2022-11-17,https://www.linkedin.com/jobs/view/data-analys...
4,Entry-Level Data Analyst,The Federal Savings Bank,"The Federal Savings Bank, a national bank and ...",onsite,,"Chicago, IL","[{'Seniority level': 'Entry level'}, {'Employm...",2022-11-17,https://www.linkedin.com/jobs/view/entry-level...


In [5]:
mdf = pd.DataFrame()

In [6]:
mdf['text'] = (df['title'] + ' ' + df['company'] + ' ' + df['description'] + ' ' + df['location']).copy()

In [7]:
mdf['link'] = df['link'].copy()

In [8]:
# Varaibles related to dataset
DATASET_COLUMNS = ["clean_text", "category"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# Parameters for WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# # Parameters related to KERAS
# SEQUENCE_LENGTH = 300
# EPOCHS = 8
# BATCH_SIZE = 1024

# Variable specific to SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# Variables for Exporting purpose
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

# MISC

plt.style.use('fivethirtyeight')
pd.options.display.max_columns = 250
pd.options.display.max_rows = 250


### **Step - 2.2 :** Null Values Identification and Treatment

In [9]:
# Confirming presence of missing values in each variable
df.isnull().sum()

title               0
company             0
description         0
onsite_remote       0
salary           1916
location            0
criteria            0
posted_date         0
link                0
dtype: int64

In [10]:
mdf.isnull().sum()

text    0
link    0
dtype: int64

**Note**: No Nulls present

### **Step - 2.3 :** Label Encoding

### **Step - 2.4 :** Text Manipulation and Structurization

In [11]:
# Including Stopwords and exclusing 'not' related stopwords

stop_words = set(stopwords.words("english"))
stop_words.remove('not')
more_stopwords = {'one', 'br', 'Po', 'th', 'sayi', 'fo', 'Unknown'}
stop_words = stop_words.union(more_stopwords)
                 
stemmer = SnowballStemmer("english")

In [12]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text)


def remove_emoji(text):
    emoji_pattern = re.compile(
        '['
        u'\U0001F600-\U0001F64F'  # emoticons
        u'\U0001F300-\U0001F5FF'  # symbols & pictographs
        u'\U0001F680-\U0001F6FF'  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF'  # flags (iOS)
        u'\U00002702-\U000027B0'
        u'\U000024C2-\U0001F251'
        ']+',
        flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)


def remove_html(text):
    html = re.compile(r'^[^ ]<.*?>|&([a-z0-9]+|#[0-9]\"\'\“{1,6}|#x[0-9a-f]{1,6});[^A-Za-z0-9]+')
    return re.sub(html, '', text)


def remove_punct(text):
    table = str.maketrans('', '', string.punctuation)
    return text.translate(table)

def remove_quotes(text):
    quotes = re.compile(r'[^A-Za-z0-9\s]+')
    return re.sub(quotes, '', text)


# Applying helper functions

mdf['mod_text'] = mdf['text'].apply(lambda x: remove_URL(x))
mdf['mod_text'] = mdf['mod_text'].apply(lambda x: remove_emoji(x))
mdf['mod_text'] = mdf['mod_text'].apply(lambda x: remove_html(x))
mdf['mod_text'] = mdf['mod_text'].apply(lambda x: remove_punct(x))
mdf['mod_text'] = mdf['mod_text'].apply(lambda x: remove_quotes(x))
mdf.head()

Unnamed: 0,text,link,mod_text
0,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...
1,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...
2,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...
3,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...
4,Entry-Level Data Analyst The Federal Savings B...,https://www.linkedin.com/jobs/view/entry-level...,EntryLevel Data Analyst The Federal Savings Ba...


In [13]:
# Tokenizing the tweet base texts.

mdf['tokenized'] = mdf['mod_text'].apply(word_tokenize)

mdf.head()

Unnamed: 0,text,link,mod_text,tokenized
0,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ..."
1,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ..."
2,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY..."
3,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY..."
4,Entry-Level Data Analyst The Federal Savings B...,https://www.linkedin.com/jobs/view/entry-level...,EntryLevel Data Analyst The Federal Savings Ba...,"[EntryLevel, Data, Analyst, The, Federal, Savi..."


In [14]:
# Lower casing clean text.

mdf['lower'] = mdf['tokenized'].apply(
    lambda x: [word.lower() for word in x])

mdf.head()

Unnamed: 0,text,link,mod_text,tokenized,lower
0,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ..."
1,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ..."
2,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py..."
3,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py..."
4,Entry-Level Data Analyst The Federal Savings B...,https://www.linkedin.com/jobs/view/entry-level...,EntryLevel Data Analyst The Federal Savings Ba...,"[EntryLevel, Data, Analyst, The, Federal, Savi...","[entrylevel, data, analyst, the, federal, savi..."


In [15]:
# Removing stopwords.

mdf['stopwords_removed'] = mdf['lower'].apply(
    lambda x: [word for word in x if word not in stop_words])

mdf.head()

Unnamed: 0,text,link,mod_text,tokenized,lower,stopwords_removed
0,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ...","[data, analyst, recent, graduate, paypal, payp..."
1,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ...","[data, analyst, recent, graduate, paypal, payp..."
2,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py...","[data, analyst, paypal, paypal, nasdaq, pypl, ..."
3,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py...","[data, analyst, paypal, paypal, nasdaq, pypl, ..."
4,Entry-Level Data Analyst The Federal Savings B...,https://www.linkedin.com/jobs/view/entry-level...,EntryLevel Data Analyst The Federal Savings Ba...,"[EntryLevel, Data, Analyst, The, Federal, Savi...","[entrylevel, data, analyst, the, federal, savi...","[entrylevel, data, analyst, federal, savings, ..."


In [16]:
# Applying part of speech tags.

mdf['pos_tags'] = mdf['stopwords_removed'].apply(nltk.tag.pos_tag)

mdf.head()

Unnamed: 0,text,link,mod_text,tokenized,lower,stopwords_removed,pos_tags
0,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ...","[data, analyst, recent, graduate, paypal, payp...","[(data, NNS), (analyst, NN), (recent, JJ), (gr..."
1,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ...","[data, analyst, recent, graduate, paypal, payp...","[(data, NNS), (analyst, NN), (recent, JJ), (gr..."
2,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py...","[data, analyst, paypal, paypal, nasdaq, pypl, ...","[(data, NNS), (analyst, NN), (paypal, NN), (pa..."
3,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py...","[data, analyst, paypal, paypal, nasdaq, pypl, ...","[(data, NNS), (analyst, NN), (paypal, NN), (pa..."
4,Entry-Level Data Analyst The Federal Savings B...,https://www.linkedin.com/jobs/view/entry-level...,EntryLevel Data Analyst The Federal Savings Ba...,"[EntryLevel, Data, Analyst, The, Federal, Savi...","[entrylevel, data, analyst, the, federal, savi...","[entrylevel, data, analyst, federal, savings, ...","[(entrylevel, NN), (data, NNS), (analyst, NN),..."


In [17]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


mdf['wordnet_pos'] = mdf['pos_tags'].apply(
    lambda x: [(word, get_wordnet_pos(pos_tag)) for (word, pos_tag) in x])

mdf.head()

Unnamed: 0,text,link,mod_text,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos
0,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ...","[data, analyst, recent, graduate, paypal, payp...","[(data, NNS), (analyst, NN), (recent, JJ), (gr...","[(data, n), (analyst, n), (recent, a), (gradua..."
1,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ...","[data, analyst, recent, graduate, paypal, payp...","[(data, NNS), (analyst, NN), (recent, JJ), (gr...","[(data, n), (analyst, n), (recent, a), (gradua..."
2,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py...","[data, analyst, paypal, paypal, nasdaq, pypl, ...","[(data, NNS), (analyst, NN), (paypal, NN), (pa...","[(data, n), (analyst, n), (paypal, n), (paypal..."
3,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py...","[data, analyst, paypal, paypal, nasdaq, pypl, ...","[(data, NNS), (analyst, NN), (paypal, NN), (pa...","[(data, n), (analyst, n), (paypal, n), (paypal..."
4,Entry-Level Data Analyst The Federal Savings B...,https://www.linkedin.com/jobs/view/entry-level...,EntryLevel Data Analyst The Federal Savings Ba...,"[EntryLevel, Data, Analyst, The, Federal, Savi...","[entrylevel, data, analyst, the, federal, savi...","[entrylevel, data, analyst, federal, savings, ...","[(entrylevel, NN), (data, NNS), (analyst, NN),...","[(entrylevel, n), (data, n), (analyst, n), (fe..."


In [18]:
# Applying word lemmatizer.

wnl = WordNetLemmatizer()

mdf['lemmatized'] = mdf['wordnet_pos'].apply(
    lambda x: [wnl.lemmatize(word, tag) for word, tag in x])

mdf['lemmatized'] = mdf['lemmatized'].apply(
    lambda x: [word for word in x if word not in stop_words])

mdf['lemma_str'] = [' '.join(map(str, l)) for l in mdf['lemmatized']]

mdf.head()

Unnamed: 0,text,link,mod_text,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str
0,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ...","[data, analyst, recent, graduate, paypal, payp...","[(data, NNS), (analyst, NN), (recent, JJ), (gr...","[(data, n), (analyst, n), (recent, a), (gradua...","[data, analyst, recent, graduate, paypal, payp...",data analyst recent graduate paypal paypal nas...
1,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ...","[data, analyst, recent, graduate, paypal, payp...","[(data, NNS), (analyst, NN), (recent, JJ), (gr...","[(data, n), (analyst, n), (recent, a), (gradua...","[data, analyst, recent, graduate, paypal, payp...",data analyst recent graduate paypal paypal nas...
2,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py...","[data, analyst, paypal, paypal, nasdaq, pypl, ...","[(data, NNS), (analyst, NN), (paypal, NN), (pa...","[(data, n), (analyst, n), (paypal, n), (paypal...","[data, analyst, paypal, paypal, nasdaq, pypl, ...",data analyst paypal paypal nasdaq pypl believe...
3,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py...","[data, analyst, paypal, paypal, nasdaq, pypl, ...","[(data, NNS), (analyst, NN), (paypal, NN), (pa...","[(data, n), (analyst, n), (paypal, n), (paypal...","[data, analyst, paypal, paypal, nasdaq, pypl, ...",data analyst paypal paypal nasdaq pypl believe...
4,Entry-Level Data Analyst The Federal Savings B...,https://www.linkedin.com/jobs/view/entry-level...,EntryLevel Data Analyst The Federal Savings Ba...,"[EntryLevel, Data, Analyst, The, Federal, Savi...","[entrylevel, data, analyst, the, federal, savi...","[entrylevel, data, analyst, federal, savings, ...","[(entrylevel, NN), (data, NNS), (analyst, NN),...","[(entrylevel, n), (data, n), (analyst, n), (fe...","[entrylevel, data, analyst, federal, saving, b...",entrylevel data analyst federal saving bank fe...


In [19]:
mdf.head()

Unnamed: 0,text,link,mod_text,tokenized,lower,stopwords_removed,pos_tags,wordnet_pos,lemmatized,lemma_str
0,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ...","[data, analyst, recent, graduate, paypal, payp...","[(data, NNS), (analyst, NN), (recent, JJ), (gr...","[(data, n), (analyst, n), (recent, a), (gradua...","[data, analyst, recent, graduate, paypal, payp...",data analyst recent graduate paypal paypal nas...
1,Data Analyst - Recent Graduate PayPal At PayPa...,https://www.linkedin.com/jobs/view/data-analys...,Data Analyst Recent Graduate PayPal At PayPal...,"[Data, Analyst, Recent, Graduate, PayPal, At, ...","[data, analyst, recent, graduate, paypal, at, ...","[data, analyst, recent, graduate, paypal, payp...","[(data, NNS), (analyst, NN), (recent, JJ), (gr...","[(data, n), (analyst, n), (recent, a), (gradua...","[data, analyst, recent, graduate, paypal, payp...",data analyst recent graduate paypal paypal nas...
2,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py...","[data, analyst, paypal, paypal, nasdaq, pypl, ...","[(data, NNS), (analyst, NN), (paypal, NN), (pa...","[(data, n), (analyst, n), (paypal, n), (paypal...","[data, analyst, paypal, paypal, nasdaq, pypl, ...",data analyst paypal paypal nasdaq pypl believe...
3,"Data Analyst PayPal At PayPal (NASDAQ: PYPL), ...",https://www.linkedin.com/jobs/view/data-analys...,Data Analyst PayPal At PayPal NASDAQ PYPL we b...,"[Data, Analyst, PayPal, At, PayPal, NASDAQ, PY...","[data, analyst, paypal, at, paypal, nasdaq, py...","[data, analyst, paypal, paypal, nasdaq, pypl, ...","[(data, NNS), (analyst, NN), (paypal, NN), (pa...","[(data, n), (analyst, n), (paypal, n), (paypal...","[data, analyst, paypal, paypal, nasdaq, pypl, ...",data analyst paypal paypal nasdaq pypl believe...
4,Entry-Level Data Analyst The Federal Savings B...,https://www.linkedin.com/jobs/view/entry-level...,EntryLevel Data Analyst The Federal Savings Ba...,"[EntryLevel, Data, Analyst, The, Federal, Savi...","[entrylevel, data, analyst, the, federal, savi...","[entrylevel, data, analyst, federal, savings, ...","[(entrylevel, NN), (data, NNS), (analyst, NN),...","[(entrylevel, n), (data, n), (analyst, n), (fe...","[entrylevel, data, analyst, federal, saving, b...",entrylevel data analyst federal saving bank fe...


In [20]:
Mdf = mdf[["lemma_str", "link"]]
Mdf.head()

Unnamed: 0,lemma_str,link
0,data analyst recent graduate paypal paypal nas...,https://www.linkedin.com/jobs/view/data-analys...
1,data analyst recent graduate paypal paypal nas...,https://www.linkedin.com/jobs/view/data-analys...
2,data analyst paypal paypal nasdaq pypl believe...,https://www.linkedin.com/jobs/view/data-analys...
3,data analyst paypal paypal nasdaq pypl believe...,https://www.linkedin.com/jobs/view/data-analys...
4,entrylevel data analyst federal saving bank fe...,https://www.linkedin.com/jobs/view/entry-level...


### **Step - 2.4 :** Train-Test Split

In [21]:
TRAIN_SIZE = 0.8

In [22]:
df_train, df_test = train_test_split(Mdf, test_size=1-TRAIN_SIZE, random_state=42)
print("TRAIN size:", len(df_train))
print("TEST size:", len(df_test))

TRAIN size: 2276
TEST size: 569


In [23]:
from sklearn.naive_bayes import MultinomialNB

In [24]:
# import pandas as pd
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.svm import SVC
# from sklearn.metrics import accuracy_score

# # Load the preprocessed data

# # Extract features from the text data using TF-IDF representation
# vectorizer = TfidfVectorizer()
# X = vectorizer.fit_transform(df['mod_text'])

# # Split the dataset into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, df['category'], test_size=0.2, random_state=42)

# # Train an SVM classifier on the training set
# svm = SVC(kernel='rbf', C=0.1)
# svm.fit(X_train, y_train)

# # Evaluate the performance of the trained model on the test set
# y_pred = svm.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred))

In [27]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the preprocessed data
# tweets = pd.read_csv('preprocessed_tweets.csv')

# Extract features from the text data using TF-IDF representation
vectorizer = TfidfVectorizer(max_features=300)
# vectorizer = CountVectorizer()
X = vectorizer.fit_transform(Mdf['lemma_str'])

In [38]:
vec = X.toarray().tolist()
Mdf['vec'] = vec

In [39]:
Mdf

Unnamed: 0,lemma_str,link,vec
0,data analyst recent graduate paypal paypal nas...,https://www.linkedin.com/jobs/view/data-analys...,"[0.025721760265704163, 0.032294339663539375, 0..."
1,data analyst recent graduate paypal paypal nas...,https://www.linkedin.com/jobs/view/data-analys...,"[0.025736764674450414, 0.03231317808935069, 0...."
2,data analyst paypal paypal nasdaq pypl believe...,https://www.linkedin.com/jobs/view/data-analys...,"[0.06546966502717229, 0.027399622967804495, 0...."
3,data analyst paypal paypal nasdaq pypl believe...,https://www.linkedin.com/jobs/view/data-analys...,"[0.06549983815092186, 0.027412250681939827, 0...."
4,entrylevel data analyst federal saving bank fe...,https://www.linkedin.com/jobs/view/entry-level...,"[0.11848594440451815, 0.09917479193824005, 0.0..."
...,...,...,...
2840,junior data analyst iris software inc iriss cl...,https://www.linkedin.com/jobs/view/junior-data...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2841,data analyst sql marwood group marwood group m...,https://www.linkedin.com/jobs/view/data-analys...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2842,data analyst smartsense digi join highperformi...,https://www.linkedin.com/jobs/view/data-analys...,"[0.10833037967110319, 0.06800580599538941, 0.1..."
2843,data analyst synergy search nashville berry hi...,https://www.linkedin.com/jobs/view/data-analys...,"[0.051153514872604604, 0.0, 0.0, 0.0, 0.0, 0.0..."


In [34]:
vec.apply(lambda x: list(x))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,...,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,0.025722,0.032294,0.037764,0.080292,0.036333,0.000000,0.000000,0.0,0.039351,0.091775,0.037889,0.000000,0.041948,0.050056,0.000000,0.000000,0.000000,0.063850,0.085851,0.072628,0.000000,0.000000,0.034142,0.042152,0.035534,0.062335,0.039720,0.131057,0.0,0.0000,0.030577,0.035300,0.084842,0.000000,0.040218,0.000000,0.000000,0.041805,0.000000,0.034564,0.037435,0.038057,0.041831,0.037619,0.032490,0.039955,0.000000,0.032642,0.077980,0.082518,0.037826,0.040607,0.090387,0.090322,0.000000,0.034530,0.031308,0.029306,0.216834,0.000000,0.000000,0.000000,0.000000,0.000000,0.026443,0.000000,0.046577,0.039170,0.000000,0.024498,0.000000,0.037374,0.101827,0.000000,0.137866,0.000000,0.137848,0.000000,0.037826,0.039791,0.000000,0.038484,0.030767,0.031252,0.031969,0.081164,0.073789,0.000000,0.000000,0.000000,0.039838,0.000000,0.058839,0.039283,0.039013,0.042098,0.042152,0.124336,0.037784,0.037743,0.077230,0.000000,0.000000,0.039720,0.074066,0.103284,0.000000,0.0,0.340334,0.037334,0.000000,0.000000,0.000000,0.042757,0.097018,0.057992,0.039192,0.056907,0.036777,0.000000,0.042757,0.041569,0.000000,0.047085,0.035661,...,0.000000,0.000000,0.000000,0.036352,0.037722,0.000000,0.000000,0.046979,0.041005,0.000000,0.036777,0.037578,0.032733,0.084197,0.000000,0.042179,0.000000,0.088437,0.365224,0.092047,0.000000,0.040656,0.000000,0.084739,0.086078,0.000000,0.036486,0.076029,0.067070,0.000000,0.000000,0.000000,0.060145,0.077044,0.055489,0.000000,0.000000,0.000000,0.036895,0.049696,0.042701,0.033043,0.033959,0.0,0.000000,0.036390,0.000000,0.085402,0.236916,0.035771,0.039535,0.091439,0.000000,0.035753,0.037113,0.000000,0.000000,0.000000,0.036954,0.030740,0.056717,0.000000,0.038702,0.000000,0.000000,0.000000,0.041233,0.000000,0.024954,0.000000,0.067885,0.088682,0.036641,0.000000,0.089429,0.085646,0.030618,0.038355,0.036777,0.038724,0.092459,0.000000,0.000000,0.035882,0.023063,0.031195,0.027351,0.037233,0.000000,0.136633,0.000000,0.032460,0.040607,0.023658,0.060041,0.033664,0.032718,0.000000,0.041242,0.059937,0.054414,0.0,0.081213,0.000000,0.000000,0.039861,0.000000,0.033746,0.037013,0.000000,0.000000,0.000000,0.000000,0.066591,0.038768,0.126782,0.000000,0.000000,0.000000,0.097515,0.037784,0.076253,0.028347,0.020711,0.000000
1,0.025737,0.032313,0.037786,0.080338,0.036354,0.000000,0.000000,0.0,0.039374,0.091829,0.037911,0.000000,0.041972,0.050085,0.000000,0.000000,0.000000,0.063887,0.085901,0.072671,0.000000,0.000000,0.000000,0.042177,0.035555,0.062371,0.039744,0.131133,0.0,0.0000,0.030595,0.035320,0.084892,0.000000,0.040241,0.000000,0.000000,0.041829,0.000000,0.034585,0.037457,0.038079,0.041856,0.037641,0.032509,0.039979,0.000000,0.032661,0.078026,0.082566,0.037848,0.040630,0.090439,0.090375,0.000000,0.034550,0.031326,0.029323,0.216961,0.000000,0.000000,0.000000,0.000000,0.000000,0.026458,0.000000,0.046604,0.039193,0.000000,0.024512,0.000000,0.037396,0.101887,0.000000,0.137946,0.000000,0.137928,0.000000,0.037848,0.039814,0.000000,0.038507,0.030785,0.031270,0.031987,0.081212,0.073832,0.000000,0.000000,0.000000,0.039861,0.000000,0.058874,0.039306,0.039035,0.042123,0.042177,0.124408,0.037806,0.037765,0.077275,0.000000,0.000000,0.039744,0.074109,0.103344,0.000000,0.0,0.340532,0.037355,0.000000,0.000000,0.000000,0.042782,0.097074,0.058026,0.039215,0.056940,0.036798,0.000000,0.042782,0.041593,0.000000,0.047112,0.035682,...,0.000000,0.000000,0.000000,0.036373,0.037744,0.000000,0.000000,0.047006,0.041029,0.000000,0.036798,0.037600,0.032753,0.084246,0.000000,0.042204,0.000000,0.088488,0.365437,0.092100,0.000000,0.040680,0.000000,0.084789,0.086128,0.000000,0.036507,0.076073,0.067109,0.000000,0.000000,0.000000,0.060180,0.077089,0.055521,0.000000,0.000000,0.000000,0.036916,0.049725,0.042726,0.033063,0.033979,0.0,0.000000,0.036412,0.000000,0.085452,0.237054,0.035792,0.039558,0.091493,0.000000,0.035774,0.037134,0.000000,0.000000,0.000000,0.036975,0.030758,0.056750,0.000000,0.038725,0.000000,0.000000,0.000000,0.041257,0.000000,0.024968,0.000000,0.067925,0.088734,0.036662,0.000000,0.089481,0.085695,0.030636,0.038377,0.036798,0.038747,0.092512,0.000000,0.000000,0.035903,0.023076,0.031214,0.027367,0.037255,0.000000,0.136713,0.000000,0.032478,0.040630,0.023672,0.060076,0.033684,0.032737,0.000000,0.041267,0.059972,0.054446,0.0,0.081261,0.000000,0.000000,0.039884,0.000000,0.033765,0.037035,0.000000,0.000000,0.000000,0.000000,0.066629,0.038791,0.126856,0.000000,0.000000,0.000000,0.097572,0.037806,0.076297,0.028363,0.020723,0.000000
2,0.065470,0.027400,0.032040,0.068122,0.000000,0.000000,0.077411,0.0,0.033387,0.077865,0.032146,0.065047,0.053385,0.056626,0.074212,0.000000,0.046185,0.054172,0.000000,0.030810,0.030634,0.035925,0.000000,0.035763,0.030148,0.079331,0.033700,0.111193,0.0,0.0000,0.025943,0.029949,0.125970,0.000000,0.034122,0.083698,0.000000,0.035469,0.000000,0.087977,0.031761,0.032289,0.035491,0.000000,0.000000,0.033900,0.000000,0.055389,0.000000,0.000000,0.032093,0.068904,0.076687,0.076632,0.052000,0.029297,0.053125,0.024864,0.070758,0.033214,0.034431,0.033740,0.000000,0.111779,0.000000,0.059387,0.000000,0.033233,0.000000,0.020785,0.028371,0.031709,0.086394,0.000000,0.116970,0.035514,0.116955,0.000000,0.032093,0.033760,0.000000,0.000000,0.078312,0.026515,0.027123,0.034431,0.062605,0.000000,0.077074,0.035491,0.033800,0.069666,0.049921,0.033329,0.033100,0.000000,0.000000,0.131863,0.032058,0.000000,0.032762,0.087619,0.088190,0.033700,0.062840,0.116839,0.034327,0.0,0.000000,0.031675,0.000000,0.000000,0.000000,0.036276,0.082313,0.073804,0.000000,0.024141,0.031203,0.066161,0.000000,0.035268,0.000000,0.059923,0.030256,...,0.106394,0.045943,0.000000,0.030842,0.000000,0.056579,0.063975,0.019929,0.034790,0.032217,0.031203,0.031882,0.027772,0.035718,0.061556,0.000000,0.000000,0.112549,0.387336,0.156191,0.000000,0.000000,0.027956,0.071896,0.073032,0.000000,0.123824,0.064506,0.000000,0.000000,0.032505,0.084921,0.025515,0.021789,0.094157,0.000000,0.000000,0.000000,0.031303,0.042164,0.036229,0.028035,0.000000,0.0,0.033900,0.030875,0.000000,0.072458,0.000000,0.030350,0.033543,0.077580,0.035114,0.000000,0.031488,0.000000,0.000000,0.000000,0.031353,0.026081,0.096241,0.062807,0.032836,0.057652,0.000000,0.037082,0.069968,0.295858,0.042343,0.000000,0.000000,0.112861,0.000000,0.136444,0.000000,0.072665,0.025977,0.032542,0.031203,0.032855,0.058834,0.049643,0.030618,0.000000,0.019567,0.026467,0.046411,0.000000,0.030794,0.115924,0.032271,0.000000,0.034452,0.020072,0.000000,0.000000,0.027759,0.036041,0.139966,0.025426,0.046167,0.0,0.068904,0.000000,0.050436,0.000000,0.087759,0.000000,0.031403,0.030350,0.021237,0.032633,0.000000,0.028249,0.032892,0.107566,0.000000,0.032744,0.030907,0.055157,0.032058,0.097043,0.024050,0.017572,0.127271
3,0.065500,0.027412,0.032055,0.068153,0.000000,0.000000,0.077446,0.0,0.033402,0.077901,0.032161,0.065077,0.053409,0.056652,0.074246,0.000000,0.046207,0.054197,0.000000,0.030824,0.030648,0.035941,0.000000,0.035780,0.030162,0.079367,0.033716,0.111244,0.0,0.0000,0.025955,0.029963,0.126028,0.000000,0.034138,0.083737,0.000000,0.035485,0.000000,0.088017,0.031776,0.032303,0.035508,0.000000,0.000000,0.033915,0.000000,0.055414,0.000000,0.000000,0.032108,0.068936,0.076722,0.076667,0.052024,0.029310,0.053150,0.024875,0.070790,0.033229,0.034447,0.033755,0.000000,0.111830,0.000000,0.059414,0.000000,0.033248,0.000000,0.020794,0.028384,0.031724,0.086434,0.000000,0.117024,0.035530,0.117009,0.000000,0.032108,0.033775,0.000000,0.000000,0.078348,0.026527,0.027136,0.034447,0.062634,0.000000,0.077110,0.035508,0.033815,0.069698,0.049944,0.033344,0.033115,0.000000,0.000000,0.131924,0.032072,0.000000,0.032777,0.087659,0.088231,0.033716,0.062869,0.116893,0.034343,0.0,0.000000,0.031690,0.000000,0.000000,0.000000,0.036293,0.082351,0.073838,0.000000,0.024152,0.031217,0.066192,0.000000,0.035285,0.000000,0.059950,0.030270,...,0.106443,0.045964,0.000000,0.030857,0.000000,0.056605,0.064004,0.019938,0.034806,0.032232,0.031217,0.031897,0.027785,0.035734,0.061584,0.000000,0.000000,0.112601,0.387514,0.156263,0.000000,0.000000,0.027969,0.071929,0.073065,0.000000,0.123881,0.064535,0.000000,0.000000,0.032520,0.084960,0.025526,0.021799,0.094201,0.000000,0.000000,0.000000,0.031317,0.042183,0.036246,0.028048,0.000000,0.0,0.033915,0.030889,0.000000,0.072491,0.000000,0.030364,0.033558,0.077616,0.035131,0.000000,0.031502,0.000000,0.000000,0.000000,0.031367,0.026093,0.096286,0.062836,0.032852,0.057679,0.000000,0.037100,0.070000,0.295994,0.042363,0.000000,0.000000,0.112913,0.000000,0.136506,0.000000,0.072698,0.025989,0.032557,0.031217,0.032870,0.058861,0.049666,0.030632,0.000000,0.019576,0.026479,0.046432,0.000000,0.030808,0.115978,0.032286,0.000000,0.034468,0.020081,0.000000,0.000000,0.027772,0.036058,0.140031,0.025438,0.046188,0.0,0.068936,0.000000,0.050460,0.000000,0.087800,0.000000,0.031418,0.000000,0.021247,0.032648,0.000000,0.028262,0.032908,0.107615,0.000000,0.032759,0.030922,0.055182,0.032072,0.097088,0.024061,0.017580,0.127330
4,0.118486,0.099175,0.000000,0.000000,0.055789,0.066616,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.128820,0.102481,0.044769,0.162215,0.000000,0.000000,0.065912,0.000000,0.000000,0.000000,0.000000,0.064724,0.000000,0.095714,0.000000,0.000000,0.0,0.0598,0.000000,0.000000,0.000000,0.055383,0.000000,0.000000,0.000000,0.000000,0.000000,0.053073,0.057481,0.000000,0.000000,0.057763,0.099775,0.000000,0.052093,0.000000,0.059869,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.106041,0.048073,0.000000,0.230500,0.000000,0.000000,0.000000,0.000000,0.000000,0.040602,0.000000,0.000000,0.000000,0.058468,0.112848,0.000000,0.000000,0.052118,0.055935,0.000000,0.000000,0.052916,0.063118,0.058081,0.000000,0.000000,0.000000,0.047243,0.000000,0.049087,0.062313,0.000000,0.000000,0.000000,0.064232,0.000000,0.000000,0.150578,0.000000,0.179709,0.000000,0.064724,0.047729,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.056864,0.000000,0.000000,0.0,0.087096,0.000000,0.000000,0.000000,0.068852,0.065652,0.049656,0.089046,0.060179,0.131068,0.056470,0.000000,0.131305,0.000000,0.068227,0.144596,0.000000,...,0.000000,0.041574,0.000000,0.000000,0.115844,0.051197,0.000000,0.108203,0.000000,0.116612,0.056470,0.057700,0.050262,0.000000,0.000000,0.000000,0.139893,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.301592,0.000000,0.000000,0.051492,0.065695,0.000000,0.000000,0.046176,0.039433,0.000000,0.065912,0.129614,0.179606,0.000000,0.038154,0.000000,0.000000,0.052143,0.0,0.000000,0.055877,0.000000,0.000000,0.090945,0.000000,0.000000,0.000000,0.000000,0.054898,0.056986,0.000000,0.046196,0.065652,0.056742,0.000000,0.043544,0.000000,0.118854,0.000000,0.000000,0.000000,0.000000,0.066930,0.038316,0.079285,0.052118,0.000000,0.056261,0.000000,0.205975,0.043836,0.000000,0.058893,0.056470,0.000000,0.106476,0.044922,0.055412,0.000000,0.035413,0.000000,0.000000,0.000000,0.055730,0.157349,0.000000,0.000000,0.062351,0.036326,0.046096,0.051691,0.000000,0.065226,0.158318,0.046016,0.000000,0.0,0.062351,0.127736,0.000000,0.000000,0.052942,0.000000,0.000000,0.000000,0.000000,0.118118,0.000000,0.000000,0.059528,0.000000,0.049725,0.000000,0.000000,0.049911,0.058017,0.117085,0.043526,0.127203,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.151761,0.000000,0.120110,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.000000,0.000000,0.096460,0.164030,0.000000,0.000000,0.000000,0.000000,0.332383,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.172022,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.227562,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.165667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.424083,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.156569,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.162178,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.151635,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.126174,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.203349,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.139797,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.129831,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.124384,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.162679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2841,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.067214,0.080206,0.000000,0.126957,0.000000,0.000000,0.137561,0.000000,0.000000,0.000000,0.054706,0.000000,0.000000,0.249702,0.000000,0.104998,0.0,0.0000,0.000000,0.000000,0.067972,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.055383,0.000000,0.000000,0.000000,0.060278,0.052059,0.000000,0.054360,0.052303,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.049103,0.000000,0.000000,0.046957,0.534521,0.000000,0.065026,0.000000,0.060410,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.039254,0.000000,0.000000,0.000000,0.058370,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.125706,0.125888,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.061873,0.000000,0.000000,0.000000,0.000000,0.000000,0.064830,0.0,0.000000,0.000000,0.136485,0.069702,0.071849,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.142394,0.037723,0.000000,...,0.000000,0.173534,0.000000,0.000000,0.181330,0.000000,0.000000,0.037638,0.000000,0.060844,0.000000,0.000000,0.000000,0.000000,0.058126,0.000000,0.145983,0.000000,0.000000,0.000000,0.000000,0.000000,0.052796,0.000000,0.000000,0.000000,0.058463,0.060912,0.000000,0.000000,0.000000,0.000000,0.000000,0.041150,0.000000,0.068781,0.000000,0.000000,0.000000,0.039815,0.000000,0.052946,0.054413,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.132632,0.057288,0.000000,0.066816,0.000000,0.068510,0.000000,0.000000,0.045440,0.059307,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.079968,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.137232,0.049060,0.000000,0.000000,0.000000,0.037037,0.046877,0.000000,0.057495,0.073909,0.049985,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033042,0.096038,0.000000,0.0,0.000000,0.066648,0.190505,0.000000,0.000000,0.000000,0.059307,0.000000,0.080215,0.000000,0.000000,0.000000,0.000000,0.000000,0.051890,0.000000,0.000000,0.000000,0.000000,0.122182,0.000000,0.066370,0.000000
2842,0.108330,0.068006,0.119285,0.000000,0.229532,0.000000,0.000000,0.0,0.041433,0.000000,0.000000,0.000000,0.000000,0.035136,0.000000,0.083425,0.028658,0.033614,0.000000,0.038235,0.000000,0.089165,0.107844,0.000000,0.037414,0.000000,0.000000,0.000000,0.0,0.0000,0.032195,0.037167,0.066998,0.037977,0.000000,0.000000,0.047859,0.000000,0.000000,0.036393,0.000000,0.000000,0.000000,0.039609,0.034209,0.000000,0.035721,0.000000,0.000000,0.000000,0.039827,0.000000,0.000000,0.000000,0.000000,0.000000,0.131857,0.030856,0.561984,0.000000,0.000000,0.041871,0.119089,0.046239,0.027842,0.147399,0.000000,0.000000,0.000000,0.025794,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.040520,0.000000,0.131620,0.000000,0.000000,0.038847,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.041361,0.000000,0.088651,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.085201,0.0,0.000000,0.000000,0.000000,0.045802,0.000000,0.000000,0.068100,0.061061,0.000000,0.000000,0.000000,0.000000,0.045019,0.000000,0.046784,0.049576,0.000000,...,0.000000,0.114031,0.000000,0.038275,0.000000,0.000000,0.039696,0.024732,0.000000,0.079963,0.000000,0.000000,0.034465,0.000000,0.000000,0.000000,0.047963,0.000000,0.000000,0.000000,0.044554,0.042807,0.034693,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.031664,0.027040,0.175273,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.141240,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.037645,0.000000,0.043906,0.063355,0.000000,0.038909,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.052548,0.000000,0.071476,0.000000,0.000000,0.000000,0.000000,0.120236,0.032238,0.000000,0.000000,0.000000,0.097350,0.092411,0.037997,0.000000,0.024283,0.000000,0.057596,0.156811,0.000000,0.000000,0.000000,0.068354,0.000000,0.049819,0.094826,0.000000,0.000000,0.000000,0.217122,0.063108,0.028646,0.0,0.000000,0.087591,0.093887,0.000000,0.000000,0.071062,0.000000,0.000000,0.105420,0.000000,0.044269,0.070114,0.000000,0.000000,0.068195,0.000000,0.000000,0.000000,0.000000,0.060215,0.000000,0.000000,0.000000
2843,0.051154,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.083422,0.099548,0.115968,0.210097,0.000000,0.000000,0.000000,0.000000,0.143611,0.000000,0.135797,0.000000,0.000000,0.185951,0.000000,0.000000,0.0,0.0000,0.060810,0.140402,0.126546,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.149628,0.000000,0.000000,0.202408,0.000000,0.077541,0.000000,0.000000,0.000000,0.000000,0.000000,0.060944,0.068671,0.000000,0.233124,0.331710,0.000000,0.000000,0.079086,0.074978,0.000000,0.052587,0.139203,0.000000,0.000000,0.000000,0.048720,0.000000,0.000000,0.000000,0.072446,0.000000,0.000000,0.068535,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.080707,0.073373,0.269438,0.000000,0.000000,0.079226,0.000000,0.117015,0.000000,0.000000,0.000000,0.167658,0.000000,0.000000,0.075060,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.173022,0.089176,0.000000,0.000000,0.057666,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.046819,0.000000,...,0.000000,0.053845,0.078441,0.000000,0.150038,0.132620,0.000000,0.046714,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.065528,0.000000,0.000000,0.078123,0.000000,0.075600,0.066692,0.000000,0.000000,0.000000,0.000000,0.051073,0.000000,0.085367,0.000000,0.000000,0.000000,0.049416,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.119665,0.000000,0.000000,0.000000,0.000000,0.000000,0.076968,0.067568,0.087636,0.000000,0.000000,0.000000,0.000000,0.000000,0.067502,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.091937,0.000000,0.000000,0.000000,0.091732,0.000000,0.000000,0.000000,0.072181,0.000000,0.000000,0.193659,0.000000,0.094098,0.000000,0.066949,0.000000,0.000000,0.164040,0.059599,0.000000,0.0,0.000000,0.000000,0.059111,0.000000,0.000000,0.000000,0.073609,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.072446,0.000000,0.000000,0.151646,0.000000,0.123563,0.000000


In [32]:
vec.iloc

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,...,175,176,177,178,179,180,181,182,183,184,185,186,187,188,189,190,191,192,193,194,195,196,197,198,199,200,201,202,203,204,205,206,207,208,209,210,211,212,213,214,215,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255,256,257,258,259,260,261,262,263,264,265,266,267,268,269,270,271,272,273,274,275,276,277,278,279,280,281,282,283,284,285,286,287,288,289,290,291,292,293,294,295,296,297,298,299
0,0.025722,0.032294,0.037764,0.080292,0.036333,0.000000,0.000000,0.0,0.039351,0.091775,0.037889,0.000000,0.041948,0.050056,0.000000,0.000000,0.000000,0.063850,0.085851,0.072628,0.000000,0.000000,0.034142,0.042152,0.035534,0.062335,0.039720,0.131057,0.0,0.0000,0.030577,0.035300,0.084842,0.000000,0.040218,0.000000,0.000000,0.041805,0.000000,0.034564,0.037435,0.038057,0.041831,0.037619,0.032490,0.039955,0.000000,0.032642,0.077980,0.082518,0.037826,0.040607,0.090387,0.090322,0.000000,0.034530,0.031308,0.029306,0.216834,0.000000,0.000000,0.000000,0.000000,0.000000,0.026443,0.000000,0.046577,0.039170,0.000000,0.024498,0.000000,0.037374,0.101827,0.000000,0.137866,0.000000,0.137848,0.000000,0.037826,0.039791,0.000000,0.038484,0.030767,0.031252,0.031969,0.081164,0.073789,0.000000,0.000000,0.000000,0.039838,0.000000,0.058839,0.039283,0.039013,0.042098,0.042152,0.124336,0.037784,0.037743,0.077230,0.000000,0.000000,0.039720,0.074066,0.103284,0.000000,0.0,0.340334,0.037334,0.000000,0.000000,0.000000,0.042757,0.097018,0.057992,0.039192,0.056907,0.036777,0.000000,0.042757,0.041569,0.000000,0.047085,0.035661,...,0.000000,0.000000,0.000000,0.036352,0.037722,0.000000,0.000000,0.046979,0.041005,0.000000,0.036777,0.037578,0.032733,0.084197,0.000000,0.042179,0.000000,0.088437,0.365224,0.092047,0.000000,0.040656,0.000000,0.084739,0.086078,0.000000,0.036486,0.076029,0.067070,0.000000,0.000000,0.000000,0.060145,0.077044,0.055489,0.000000,0.000000,0.000000,0.036895,0.049696,0.042701,0.033043,0.033959,0.0,0.000000,0.036390,0.000000,0.085402,0.236916,0.035771,0.039535,0.091439,0.000000,0.035753,0.037113,0.000000,0.000000,0.000000,0.036954,0.030740,0.056717,0.000000,0.038702,0.000000,0.000000,0.000000,0.041233,0.000000,0.024954,0.000000,0.067885,0.088682,0.036641,0.000000,0.089429,0.085646,0.030618,0.038355,0.036777,0.038724,0.092459,0.000000,0.000000,0.035882,0.023063,0.031195,0.027351,0.037233,0.000000,0.136633,0.000000,0.032460,0.040607,0.023658,0.060041,0.033664,0.032718,0.000000,0.041242,0.059937,0.054414,0.0,0.081213,0.000000,0.000000,0.039861,0.000000,0.033746,0.037013,0.000000,0.000000,0.000000,0.000000,0.066591,0.038768,0.126782,0.000000,0.000000,0.000000,0.097515,0.037784,0.076253,0.028347,0.020711,0.000000
1,0.025737,0.032313,0.037786,0.080338,0.036354,0.000000,0.000000,0.0,0.039374,0.091829,0.037911,0.000000,0.041972,0.050085,0.000000,0.000000,0.000000,0.063887,0.085901,0.072671,0.000000,0.000000,0.000000,0.042177,0.035555,0.062371,0.039744,0.131133,0.0,0.0000,0.030595,0.035320,0.084892,0.000000,0.040241,0.000000,0.000000,0.041829,0.000000,0.034585,0.037457,0.038079,0.041856,0.037641,0.032509,0.039979,0.000000,0.032661,0.078026,0.082566,0.037848,0.040630,0.090439,0.090375,0.000000,0.034550,0.031326,0.029323,0.216961,0.000000,0.000000,0.000000,0.000000,0.000000,0.026458,0.000000,0.046604,0.039193,0.000000,0.024512,0.000000,0.037396,0.101887,0.000000,0.137946,0.000000,0.137928,0.000000,0.037848,0.039814,0.000000,0.038507,0.030785,0.031270,0.031987,0.081212,0.073832,0.000000,0.000000,0.000000,0.039861,0.000000,0.058874,0.039306,0.039035,0.042123,0.042177,0.124408,0.037806,0.037765,0.077275,0.000000,0.000000,0.039744,0.074109,0.103344,0.000000,0.0,0.340532,0.037355,0.000000,0.000000,0.000000,0.042782,0.097074,0.058026,0.039215,0.056940,0.036798,0.000000,0.042782,0.041593,0.000000,0.047112,0.035682,...,0.000000,0.000000,0.000000,0.036373,0.037744,0.000000,0.000000,0.047006,0.041029,0.000000,0.036798,0.037600,0.032753,0.084246,0.000000,0.042204,0.000000,0.088488,0.365437,0.092100,0.000000,0.040680,0.000000,0.084789,0.086128,0.000000,0.036507,0.076073,0.067109,0.000000,0.000000,0.000000,0.060180,0.077089,0.055521,0.000000,0.000000,0.000000,0.036916,0.049725,0.042726,0.033063,0.033979,0.0,0.000000,0.036412,0.000000,0.085452,0.237054,0.035792,0.039558,0.091493,0.000000,0.035774,0.037134,0.000000,0.000000,0.000000,0.036975,0.030758,0.056750,0.000000,0.038725,0.000000,0.000000,0.000000,0.041257,0.000000,0.024968,0.000000,0.067925,0.088734,0.036662,0.000000,0.089481,0.085695,0.030636,0.038377,0.036798,0.038747,0.092512,0.000000,0.000000,0.035903,0.023076,0.031214,0.027367,0.037255,0.000000,0.136713,0.000000,0.032478,0.040630,0.023672,0.060076,0.033684,0.032737,0.000000,0.041267,0.059972,0.054446,0.0,0.081261,0.000000,0.000000,0.039884,0.000000,0.033765,0.037035,0.000000,0.000000,0.000000,0.000000,0.066629,0.038791,0.126856,0.000000,0.000000,0.000000,0.097572,0.037806,0.076297,0.028363,0.020723,0.000000
2,0.065470,0.027400,0.032040,0.068122,0.000000,0.000000,0.077411,0.0,0.033387,0.077865,0.032146,0.065047,0.053385,0.056626,0.074212,0.000000,0.046185,0.054172,0.000000,0.030810,0.030634,0.035925,0.000000,0.035763,0.030148,0.079331,0.033700,0.111193,0.0,0.0000,0.025943,0.029949,0.125970,0.000000,0.034122,0.083698,0.000000,0.035469,0.000000,0.087977,0.031761,0.032289,0.035491,0.000000,0.000000,0.033900,0.000000,0.055389,0.000000,0.000000,0.032093,0.068904,0.076687,0.076632,0.052000,0.029297,0.053125,0.024864,0.070758,0.033214,0.034431,0.033740,0.000000,0.111779,0.000000,0.059387,0.000000,0.033233,0.000000,0.020785,0.028371,0.031709,0.086394,0.000000,0.116970,0.035514,0.116955,0.000000,0.032093,0.033760,0.000000,0.000000,0.078312,0.026515,0.027123,0.034431,0.062605,0.000000,0.077074,0.035491,0.033800,0.069666,0.049921,0.033329,0.033100,0.000000,0.000000,0.131863,0.032058,0.000000,0.032762,0.087619,0.088190,0.033700,0.062840,0.116839,0.034327,0.0,0.000000,0.031675,0.000000,0.000000,0.000000,0.036276,0.082313,0.073804,0.000000,0.024141,0.031203,0.066161,0.000000,0.035268,0.000000,0.059923,0.030256,...,0.106394,0.045943,0.000000,0.030842,0.000000,0.056579,0.063975,0.019929,0.034790,0.032217,0.031203,0.031882,0.027772,0.035718,0.061556,0.000000,0.000000,0.112549,0.387336,0.156191,0.000000,0.000000,0.027956,0.071896,0.073032,0.000000,0.123824,0.064506,0.000000,0.000000,0.032505,0.084921,0.025515,0.021789,0.094157,0.000000,0.000000,0.000000,0.031303,0.042164,0.036229,0.028035,0.000000,0.0,0.033900,0.030875,0.000000,0.072458,0.000000,0.030350,0.033543,0.077580,0.035114,0.000000,0.031488,0.000000,0.000000,0.000000,0.031353,0.026081,0.096241,0.062807,0.032836,0.057652,0.000000,0.037082,0.069968,0.295858,0.042343,0.000000,0.000000,0.112861,0.000000,0.136444,0.000000,0.072665,0.025977,0.032542,0.031203,0.032855,0.058834,0.049643,0.030618,0.000000,0.019567,0.026467,0.046411,0.000000,0.030794,0.115924,0.032271,0.000000,0.034452,0.020072,0.000000,0.000000,0.027759,0.036041,0.139966,0.025426,0.046167,0.0,0.068904,0.000000,0.050436,0.000000,0.087759,0.000000,0.031403,0.030350,0.021237,0.032633,0.000000,0.028249,0.032892,0.107566,0.000000,0.032744,0.030907,0.055157,0.032058,0.097043,0.024050,0.017572,0.127271
3,0.065500,0.027412,0.032055,0.068153,0.000000,0.000000,0.077446,0.0,0.033402,0.077901,0.032161,0.065077,0.053409,0.056652,0.074246,0.000000,0.046207,0.054197,0.000000,0.030824,0.030648,0.035941,0.000000,0.035780,0.030162,0.079367,0.033716,0.111244,0.0,0.0000,0.025955,0.029963,0.126028,0.000000,0.034138,0.083737,0.000000,0.035485,0.000000,0.088017,0.031776,0.032303,0.035508,0.000000,0.000000,0.033915,0.000000,0.055414,0.000000,0.000000,0.032108,0.068936,0.076722,0.076667,0.052024,0.029310,0.053150,0.024875,0.070790,0.033229,0.034447,0.033755,0.000000,0.111830,0.000000,0.059414,0.000000,0.033248,0.000000,0.020794,0.028384,0.031724,0.086434,0.000000,0.117024,0.035530,0.117009,0.000000,0.032108,0.033775,0.000000,0.000000,0.078348,0.026527,0.027136,0.034447,0.062634,0.000000,0.077110,0.035508,0.033815,0.069698,0.049944,0.033344,0.033115,0.000000,0.000000,0.131924,0.032072,0.000000,0.032777,0.087659,0.088231,0.033716,0.062869,0.116893,0.034343,0.0,0.000000,0.031690,0.000000,0.000000,0.000000,0.036293,0.082351,0.073838,0.000000,0.024152,0.031217,0.066192,0.000000,0.035285,0.000000,0.059950,0.030270,...,0.106443,0.045964,0.000000,0.030857,0.000000,0.056605,0.064004,0.019938,0.034806,0.032232,0.031217,0.031897,0.027785,0.035734,0.061584,0.000000,0.000000,0.112601,0.387514,0.156263,0.000000,0.000000,0.027969,0.071929,0.073065,0.000000,0.123881,0.064535,0.000000,0.000000,0.032520,0.084960,0.025526,0.021799,0.094201,0.000000,0.000000,0.000000,0.031317,0.042183,0.036246,0.028048,0.000000,0.0,0.033915,0.030889,0.000000,0.072491,0.000000,0.030364,0.033558,0.077616,0.035131,0.000000,0.031502,0.000000,0.000000,0.000000,0.031367,0.026093,0.096286,0.062836,0.032852,0.057679,0.000000,0.037100,0.070000,0.295994,0.042363,0.000000,0.000000,0.112913,0.000000,0.136506,0.000000,0.072698,0.025989,0.032557,0.031217,0.032870,0.058861,0.049666,0.030632,0.000000,0.019576,0.026479,0.046432,0.000000,0.030808,0.115978,0.032286,0.000000,0.034468,0.020081,0.000000,0.000000,0.027772,0.036058,0.140031,0.025438,0.046188,0.0,0.068936,0.000000,0.050460,0.000000,0.087800,0.000000,0.031418,0.000000,0.021247,0.032648,0.000000,0.028262,0.032908,0.107615,0.000000,0.032759,0.030922,0.055182,0.032072,0.097088,0.024061,0.017580,0.127330
4,0.118486,0.099175,0.000000,0.000000,0.055789,0.066616,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.128820,0.102481,0.044769,0.162215,0.000000,0.000000,0.065912,0.000000,0.000000,0.000000,0.000000,0.064724,0.000000,0.095714,0.000000,0.000000,0.0,0.0598,0.000000,0.000000,0.000000,0.055383,0.000000,0.000000,0.000000,0.000000,0.000000,0.053073,0.057481,0.000000,0.000000,0.057763,0.099775,0.000000,0.052093,0.000000,0.059869,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.106041,0.048073,0.000000,0.230500,0.000000,0.000000,0.000000,0.000000,0.000000,0.040602,0.000000,0.000000,0.000000,0.058468,0.112848,0.000000,0.000000,0.052118,0.055935,0.000000,0.000000,0.052916,0.063118,0.058081,0.000000,0.000000,0.000000,0.047243,0.000000,0.049087,0.062313,0.000000,0.000000,0.000000,0.064232,0.000000,0.000000,0.150578,0.000000,0.179709,0.000000,0.064724,0.047729,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.056864,0.000000,0.000000,0.0,0.087096,0.000000,0.000000,0.000000,0.068852,0.065652,0.049656,0.089046,0.060179,0.131068,0.056470,0.000000,0.131305,0.000000,0.068227,0.144596,0.000000,...,0.000000,0.041574,0.000000,0.000000,0.115844,0.051197,0.000000,0.108203,0.000000,0.116612,0.056470,0.057700,0.050262,0.000000,0.000000,0.000000,0.139893,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.301592,0.000000,0.000000,0.051492,0.065695,0.000000,0.000000,0.046176,0.039433,0.000000,0.065912,0.129614,0.179606,0.000000,0.038154,0.000000,0.000000,0.052143,0.0,0.000000,0.055877,0.000000,0.000000,0.090945,0.000000,0.000000,0.000000,0.000000,0.054898,0.056986,0.000000,0.046196,0.065652,0.056742,0.000000,0.043544,0.000000,0.118854,0.000000,0.000000,0.000000,0.000000,0.066930,0.038316,0.079285,0.052118,0.000000,0.056261,0.000000,0.205975,0.043836,0.000000,0.058893,0.056470,0.000000,0.106476,0.044922,0.055412,0.000000,0.035413,0.000000,0.000000,0.000000,0.055730,0.157349,0.000000,0.000000,0.062351,0.036326,0.046096,0.051691,0.000000,0.065226,0.158318,0.046016,0.000000,0.0,0.062351,0.127736,0.000000,0.000000,0.052942,0.000000,0.000000,0.000000,0.000000,0.118118,0.000000,0.000000,0.059528,0.000000,0.049725,0.000000,0.000000,0.049911,0.058017,0.117085,0.043526,0.127203,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2840,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.151761,0.000000,0.120110,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0000,0.000000,0.000000,0.096460,0.164030,0.000000,0.000000,0.000000,0.000000,0.332383,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.172022,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.227562,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.165667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.424083,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.156569,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.162178,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.151635,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.126174,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.203349,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.139797,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.129831,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.124384,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.162679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2841,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.067214,0.080206,0.000000,0.126957,0.000000,0.000000,0.137561,0.000000,0.000000,0.000000,0.054706,0.000000,0.000000,0.249702,0.000000,0.104998,0.0,0.0000,0.000000,0.000000,0.067972,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.055383,0.000000,0.000000,0.000000,0.060278,0.052059,0.000000,0.054360,0.052303,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.049103,0.000000,0.000000,0.046957,0.534521,0.000000,0.065026,0.000000,0.060410,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.039254,0.000000,0.000000,0.000000,0.058370,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.125706,0.125888,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.061873,0.000000,0.000000,0.000000,0.000000,0.000000,0.064830,0.0,0.000000,0.000000,0.136485,0.069702,0.071849,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.142394,0.037723,0.000000,...,0.000000,0.173534,0.000000,0.000000,0.181330,0.000000,0.000000,0.037638,0.000000,0.060844,0.000000,0.000000,0.000000,0.000000,0.058126,0.000000,0.145983,0.000000,0.000000,0.000000,0.000000,0.000000,0.052796,0.000000,0.000000,0.000000,0.058463,0.060912,0.000000,0.000000,0.000000,0.000000,0.000000,0.041150,0.000000,0.068781,0.000000,0.000000,0.000000,0.039815,0.000000,0.052946,0.054413,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.132632,0.057288,0.000000,0.066816,0.000000,0.068510,0.000000,0.000000,0.045440,0.059307,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.079968,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.137232,0.049060,0.000000,0.000000,0.000000,0.037037,0.046877,0.000000,0.057495,0.073909,0.049985,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.033042,0.096038,0.000000,0.0,0.000000,0.066648,0.190505,0.000000,0.000000,0.000000,0.059307,0.000000,0.080215,0.000000,0.000000,0.000000,0.000000,0.000000,0.051890,0.000000,0.000000,0.000000,0.000000,0.122182,0.000000,0.066370,0.000000
2842,0.108330,0.068006,0.119285,0.000000,0.229532,0.000000,0.000000,0.0,0.041433,0.000000,0.000000,0.000000,0.000000,0.035136,0.000000,0.083425,0.028658,0.033614,0.000000,0.038235,0.000000,0.089165,0.107844,0.000000,0.037414,0.000000,0.000000,0.000000,0.0,0.0000,0.032195,0.037167,0.066998,0.037977,0.000000,0.000000,0.047859,0.000000,0.000000,0.036393,0.000000,0.000000,0.000000,0.039609,0.034209,0.000000,0.035721,0.000000,0.000000,0.000000,0.039827,0.000000,0.000000,0.000000,0.000000,0.000000,0.131857,0.030856,0.561984,0.000000,0.000000,0.041871,0.119089,0.046239,0.027842,0.147399,0.000000,0.000000,0.000000,0.025794,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.040520,0.000000,0.131620,0.000000,0.000000,0.038847,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.041361,0.000000,0.088651,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.085201,0.0,0.000000,0.000000,0.000000,0.045802,0.000000,0.000000,0.068100,0.061061,0.000000,0.000000,0.000000,0.000000,0.045019,0.000000,0.046784,0.049576,0.000000,...,0.000000,0.114031,0.000000,0.038275,0.000000,0.000000,0.039696,0.024732,0.000000,0.079963,0.000000,0.000000,0.034465,0.000000,0.000000,0.000000,0.047963,0.000000,0.000000,0.000000,0.044554,0.042807,0.034693,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.031664,0.027040,0.175273,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.141240,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.037645,0.000000,0.043906,0.063355,0.000000,0.038909,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.052548,0.000000,0.071476,0.000000,0.000000,0.000000,0.000000,0.120236,0.032238,0.000000,0.000000,0.000000,0.097350,0.092411,0.037997,0.000000,0.024283,0.000000,0.057596,0.156811,0.000000,0.000000,0.000000,0.068354,0.000000,0.049819,0.094826,0.000000,0.000000,0.000000,0.217122,0.063108,0.028646,0.0,0.000000,0.087591,0.093887,0.000000,0.000000,0.071062,0.000000,0.000000,0.105420,0.000000,0.044269,0.070114,0.000000,0.000000,0.068195,0.000000,0.000000,0.000000,0.000000,0.060215,0.000000,0.000000,0.000000
2843,0.051154,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.083422,0.099548,0.115968,0.210097,0.000000,0.000000,0.000000,0.000000,0.143611,0.000000,0.135797,0.000000,0.000000,0.185951,0.000000,0.000000,0.0,0.0000,0.060810,0.140402,0.126546,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.149628,0.000000,0.000000,0.202408,0.000000,0.077541,0.000000,0.000000,0.000000,0.000000,0.000000,0.060944,0.068671,0.000000,0.233124,0.331710,0.000000,0.000000,0.079086,0.074978,0.000000,0.052587,0.139203,0.000000,0.000000,0.000000,0.048720,0.000000,0.000000,0.000000,0.072446,0.000000,0.000000,0.068535,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.080707,0.073373,0.269438,0.000000,0.000000,0.079226,0.000000,0.117015,0.000000,0.000000,0.000000,0.167658,0.000000,0.000000,0.075060,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.173022,0.089176,0.000000,0.000000,0.057666,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.046819,0.000000,...,0.000000,0.053845,0.078441,0.000000,0.150038,0.132620,0.000000,0.046714,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.065528,0.000000,0.000000,0.078123,0.000000,0.075600,0.066692,0.000000,0.000000,0.000000,0.000000,0.051073,0.000000,0.085367,0.000000,0.000000,0.000000,0.049416,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.119665,0.000000,0.000000,0.000000,0.000000,0.000000,0.076968,0.067568,0.087636,0.000000,0.000000,0.000000,0.000000,0.000000,0.067502,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.091937,0.000000,0.000000,0.000000,0.091732,0.000000,0.000000,0.000000,0.072181,0.000000,0.000000,0.193659,0.000000,0.094098,0.000000,0.066949,0.000000,0.000000,0.164040,0.059599,0.000000,0.0,0.000000,0.000000,0.059111,0.000000,0.000000,0.000000,0.073609,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.072446,0.000000,0.000000,0.151646,0.000000,0.123563,0.000000


In [81]:
Mdf = Mdf.assign(vec = X.toarray())

ValueError: Expected a 1D array, got an array with shape (2845, 300)

In [79]:
Mdf

Unnamed: 0,lemma_str,link
0,data analyst recent graduate paypal paypal nas...,https://www.linkedin.com/jobs/view/data-analys...
1,data analyst recent graduate paypal paypal nas...,https://www.linkedin.com/jobs/view/data-analys...
2,data analyst paypal paypal nasdaq pypl believe...,https://www.linkedin.com/jobs/view/data-analys...
3,data analyst paypal paypal nasdaq pypl believe...,https://www.linkedin.com/jobs/view/data-analys...
4,entrylevel data analyst federal saving bank fe...,https://www.linkedin.com/jobs/view/entry-level...
...,...,...
2840,junior data analyst iris software inc iriss cl...,https://www.linkedin.com/jobs/view/junior-data...
2841,data analyst sql marwood group marwood group m...,https://www.linkedin.com/jobs/view/data-analys...
2842,data analyst smartsense digi join highperformi...,https://www.linkedin.com/jobs/view/data-analys...
2843,data analyst synergy search nashville berry hi...,https://www.linkedin.com/jobs/view/data-analys...


In [71]:
del Mdf['vector']

In [75]:
Mdf['vector'] = X.toarray().reshape(2845,1,300)

In [78]:
del Mdf['vector']

In [76]:
Mdf

ValueError: Shape of passed values is (3, 2, 5), indices imply (10, 3)

ValueError: Shape of passed values is (3, 2, 5), indices imply (10, 3)

In [51]:
# sum(X_train.todense()[0])

In [54]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the preprocessed data
# tweets = pd.read_csv('preprocessed_tweets.csv')

# Extract features from the text data using TF-IDF representation
vectorizer = TfidfVectorizer()
# vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['lemma_str'])

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['category'], test_size=0.2, random_state=42)

# Train the decision tree model
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Evaluate the performance of the decision tree model
y_pred = dt.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Tune the hyperparameters using grid search and cross-validation
param_grid = {'max_depth': [3, 5, 10], 'min_samples_leaf': [1, 5, 10], 'criterion': ['gini', 'entropy']}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Train the decision tree model with the best hyperparameters
dt = DecisionTreeClassifier(**grid_search.best_params_)
dt.fit(X_train, y_train)

# Evaluate the performance of the tuned decision tree model
y_pred = dt.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.6268656716417911
Best hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1}
Best cross-validation score: 0.689984129783107
Accuracy: 0.6567164179104478


In [55]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the preprocessed data
# tweets = pd.read_csv('preprocessed_tweets.csv')

# Extract features from the text data using TF-IDF representation
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['lemma_str'])

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['category'], test_size=0.2, random_state=42)

# Train the random forest model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Evaluate the performance of the random forest model
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Tune the hyperparameters using grid search and cross-validation
param_grid = {'n_estimators': [100, 200, 500], 'max_depth': [3, 5, 10], 'min_samples_leaf': [1, 5, 10], 'criterion': ['gini', 'entropy']}
grid_search = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Train the random forest model with the best hyperparameters
rf = RandomForestClassifier(**grid_search.best_params_)
rf.fit(X_train, y_train)

# Evaluate the performance of the tuned random forest model
y_pred = rf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.6940298507462687
Best hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 1, 'n_estimators': 100}
Best cross-validation score: 0.6673073532004937
Accuracy: 0.6492537313432836


In [56]:
import mlflow
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_wine
from sklearn.metrics import accuracy_score

In [57]:
mlflow.__version__

'2.2.2'

In [58]:
mlflow.set_tracking_uri('sqlite:///mlflow.db')
mlflow.set_experiment('demo-experiment')

2023/04/20 14:50:52 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/04/20 14:50:52 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/Users/ysw/Downloads/MLOps/ExperimentTracking/mlruns/1', creation_time=1682027452684, experiment_id='1', last_update_time=1682027452684, lifecycle_stage='active', name='demo-experiment', tags={}>

# One Experiment

In [None]:
with mlflow.start_run():
    # log parameters and log metrics
    # parameters: hyperparameters
    # metrics: model performance metrics

    mlflow.set_tags({"Model":"decision-tree", "Train Data": "all-data"})

    tree_depth = 5
    dt = DecisionTreeClassifier(max_depth=tree_depth)
    dt.fit(X, y)
    acc = accuracy_score(y, dt.predict(df_wine))

    mlflow.log_param("max_depth", tree_depth)
    mlflow.log_metric("accuracy", acc)

mlflow.end_run()

In [59]:
with mlflow.start_run():
#     from sklearn.naive_bayes import MultinomialNB
    mlflow.set_tags({"Model":"A", "Train Data": "all-data"})
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['lemma_str'])

    # Split the dataset into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, df['category'], test_size=0.2, random_state=42)

    # Train a Naive Bayes classifier on the training set
    nb = MultinomialNB(alpha=1)
    nb.fit(X_train, y_train)

    # Evaluate the performance of the trained model on the test set
    y_pred = nb.predict(X_test)
#     print("Accuracy:", accuracy_score(y_test, y_pred))
    acc = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", acc)
mlflow.end_run()

# Hyperparameters

In [None]:
ntrees = [20,40,60,80,100]
mtrys = [3,4,5]
for i in ntrees:
    for j in mtrys:
        with mlflow.start_run():
            mlflow.set_tags({"Model":"random-forest", "Train Data": "all-data"})

            mlflow.log_params({'n_estimators':i, 'max_features':j})

            rf = RandomForestClassifier(n_estimators = i, max_features = j, oob_score = True)
            rf.fit(X,y)
            acc = rf.oob_score_
            #acc = accuracy_score(y, rf.predict(X))
            mlflow.log_metric('accuracy', acc)
        mlflow.end_run()

In [None]:
'n_estimators': [100, 200, 500], 'max_depth': [3, 5, 10], 'min_samples_leaf': [1, 5, 10], 'criterion': ['gini', 'entropy']

In [98]:
ntrees = [100,200,500]
mtrys = [3,4,5,10]
min_samples_leaf = [1, 5, 10]
for i in ntrees:
    for j in mtrys:
        for k in min_samples_leaf:
            with mlflow.start_run():
            #     from sklearn.naive_bayes import MultinomialNB
                mlflow.set_tags({"Model":"random-forest", "Train Data": "all-data"})
                mlflow.log_params({'n_estimators':i, 'max_features':j, 'min_samples_leaf':k})

                vectorizer = CountVectorizer()
                X = vectorizer.fit_transform(df['lemma_str'])

                # Split the dataset into training and test sets
                X_train, X_test, y_train, y_test = train_test_split(X, df['category'], test_size=0.2, random_state=42)

                # Train a Naive Bayes classifier on the training set
            #     nb = MultinomialNB(alpha=1)
            #     nb.fit(X_train, y_train)
                rf = RandomForestClassifier(n_estimators = i, max_features = j, min_samples_leaf = k, oob_score = True)
                rf.fit(X_train,y_train)

                # Evaluate the performance of the trained model on the test set
                y_pred = rf.predict(X_test)
            #     print("Accuracy:", accuracy_score(y_test, y_pred))
                acc = accuracy_score(y_test, y_pred)
                mlflow.log_metric("accuracy", acc)
            mlflow.end_run()

In [66]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['lemma_str'])

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['category'], test_size=0.2, random_state=42)

# Train a Naive Bayes classifier on the training set
#     nb = MultinomialNB(alpha=1)
#     nb.fit(X_train, y_train)
rf = RandomForestClassifier(n_estimators = 200, max_features = 10, min_samples_leaf = 5, oob_score = True)
rf.fit(X_train,y_train)

# Evaluate the performance of the trained model on the test set
y_pred = rf.predict(X_test)
#     print("Accuracy:", accuracy_score(y_test, y_pred))
acc = accuracy_score(y_test, y_pred)

In [72]:
y_pred

array(['POSITIVE', 'POSITIVE'], dtype=object)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the preprocessed data
# tweets = pd.read_csv('preprocessed_tweets.csv')

# Extract features from the text data using TF-IDF representation
vectorizer = TfidfVectorizer()
# vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['lemma_str'])

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['category'], test_size=0.2, random_state=42)

# Train the decision tree model
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

# Evaluate the performance of the decision tree model
y_pred = dt.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# Tune the hyperparameters using grid search and cross-validation
param_grid = {'max_depth': [3, 5, 10], 'min_samples_leaf': [1, 5, 10], 'criterion': ['gini', 'entropy']}
grid_search = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)

# Train the decision tree model with the best hyperparameters
dt = DecisionTreeClassifier(**grid_search.best_params_)
dt.fit(X_train, y_train)

# Evaluate the performance of the tuned decision tree model
y_pred = dt.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))