**Topic of Corpus : How AI Impacts the Job Market**

#1 Combine all 5 CSV files (comments from 5 videos) into one panda DF

In [1]:
import pandas as pd
import glob

In [3]:
csv_files = glob.glob("./CleanedCommentsV2/*.csv")
dfs = []

In [4]:
for csv_file in csv_files:
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file, sep=';')
    # Append the DataFrame to the list
    dfs.append(df)

In [5]:
dataset = pd.concat(dfs, ignore_index=True)

In [6]:
dataset.head()

Unnamed: 0,id,name,text,time,likes,reply,CleanedText,ProcessText,ActualLabel
0,UCkM3xW7cOcRekISepJFCNew,@davidbreier84,"I think for the foreseeable future (or say, a ...",2023-05-07T04:22:57Z,267,,"I think for the foreseeable future or say, a f...",think foreseeable future say year least not af...,Negative
1,UC9utSJU6hMcwBdwsW1fmuNg,@thedespoiler,50 years ago people were thinking the hard lab...,2023-05-30T15:06:34Z,110,,50 years ago people were thinking the hard lab...,year ago people think hard labor job automate ...,Negative
2,UCrZdNj8aV8FkOJCVVgcD05A,@fenrirgg,"Yes, don&#39;t worry about AI. Everything&#39;...",2023-05-12T11:40:59Z,52,UCkM3xW7cOcRekISepJFCNew,"Yes, do not worry about AI. Everything .going ...",yes not worry ai everything .going alright ai,Positive
3,UCvutfU03mkx7MHBHopWTafw,@TheWeirdo879,I just graduated college with a CS degree and ...,2023-05-17T19:32:33Z,21,,I just graduated from college with a CS degree...,graduate college c degree say chatgpt useful c...,Positive
4,UCaBPZn_-UVQP-PsngtTc0dA,@megleyd,If a company goes out of business those folks ...,2023-05-12T20:40:05Z,15,UCkM3xW7cOcRekISepJFCNew,If a company goes out of business those folks ...,company go business folk not die free labor ta...,Neutral


In [10]:
# Remove rows where the content of the "CleanedText" column is NaN, empty, or has the number 0
dataset = dataset.dropna(subset=['ProcessText'])
dataset = dataset.dropna(subset=['ActualLabel'])
dataset = dataset[
    (dataset['CleanedText'] != '') &
    (dataset['CleanedText'] != '0') &
    (dataset['ActualLabel'] != 'N/A') &
    (dataset['ActualLabel'] != 'Proof?') &
    (~dataset['CleanedText'].str.contains('tina', case=False, na=False)) &
    (~dataset['CleanedText'].str.contains('Tina', case=False, na=False))
]


# Reset the index after removing rows
dataset.reset_index(drop=True, inplace=True)

In [11]:
dataset.iloc[0]

id                                      UCkM3xW7cOcRekISepJFCNew
name                                              @davidbreier84
text           I think for the foreseeable future (or say, a ...
time                                        2023-05-07T04:22:57Z
likes                                                        267
reply                                                        NaN
CleanedText    I think for the foreseeable future or say, a f...
ProcessText    think foreseeable future say year least not af...
ActualLabel                                             Negative
Name: 0, dtype: object

Cleaned Text is the cleaned scrapped comment before undergoing tokenization, lemmatization, and almost all stopword removal

ProcessText is the lemmatized, and no stopword version of the comment


In [15]:
import gensim
from gensim.utils import simple_preprocess

In [16]:
texts = dataset["ProcessText"].apply(lambda x: simple_preprocess(x, deacc=True))

In [17]:
texts = texts.tolist()

In [18]:
bigram = gensim.models.Phrases(texts)

In [19]:
texts = [bigram[doc] for doc in texts]

In [20]:
bigram = gensim.models.Phrases(texts)
texts = [bigram[line] for line in texts]
texts

[['think',
  'foreseeable',
  'future',
  'say',
  'year',
  'least',
  'not',
  'afraid',
  'far',
  'understand',
  'big',
  'problem',
  'complete',
  'objective',
  'matter',
  'mean',
  'perceive',
  'cheat',
  'valid',
  'fast',
  'way',
  'half',
  'assed',
  'ibm',
  'train',
  'watson',
  'medical',
  'cancer',
  'analysis',
  'image',
  'data',
  'take',
  'think',
  'mri',
  'scan',
  'model',
  'produce',
  'fantastic',
  'result',
  'end',
  'find',
  'not',
  'able',
  'detect',
  'cancer',
  'recognize',
  'machine',
  'image',
  'take',
  'coincidentally',
  'machine',
  'cancer',
  'patient',
  'ferry',
  'computer',
  'not',
  'give',
  'shit',
  'deeper',
  'reasoning',
  'completes',
  'fairly',
  'simple',
  'objective',
  'sure',
  'compile',
  'generate',
  'document',
  'record',
  'time',
  'drive',
  'company',
  'business',
  'company',
  'fairly',
  'procedural',
  'content',
  'begin',
  'plan',
  'reason',
  'involve',
  'not',
  'think',
  'usurp',
  'sho

In [22]:
from gensim import corpora

In [23]:
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=3)
print(dictionary.token2id)

{'able': 0, 'afraid': 1, 'analysis': 2, 'begin': 3, 'big': 4, 'business': 5, 'cancer': 6, 'cheat': 7, 'company': 8, 'complete': 9, 'computer': 10, 'content': 11, 'data': 12, 'document': 13, 'drive': 14, 'end': 15, 'far': 16, 'fast': 17, 'find': 18, 'future': 19, 'generate': 20, 'give': 21, 'half': 22, 'image': 23, 'involve': 24, 'least': 25, 'machine': 26, 'matter': 27, 'mean': 28, 'medical': 29, 'model': 30, 'not': 31, 'plan': 32, 'problem': 33, 'produce': 34, 'reason': 35, 'reasoning': 36, 'result': 37, 'say': 38, 'shit': 39, 'short': 40, 'simple': 41, 'sure': 42, 'take': 43, 'term': 44, 'think': 45, 'time': 46, 'train': 47, 'understand': 48, 'way': 49, 'year': 50, 'art': 51, 'artist': 52, 'automate': 53, 'creative': 54, 'desk': 55, 'exist': 56, 'hard_labor': 57, 'human': 58, 'job': 59, 'people': 60, 'right': 61, 'still': 62, 'work': 63, 'year_ago': 64, 'ai': 65, 'everything': 66, 'going': 67, 'worry': 68, 'yes': 69, 'chatgpt': 70, 'college': 71, 'create': 72, 'degree': 73, 'get': 74

In [24]:
corpus = [dictionary.doc2bow(text) for text in texts]
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 3),
  (7, 1),
  (8, 2),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 2),
  (24, 1),
  (25, 1),
  (26, 2),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 4),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 2),
  (44, 1),
  (45, 3),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1)],
 [(43, 1),
  (45, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 2),
  (58, 1),
  (59, 4),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1)],
 [(31, 1), (65, 2), (66, 1), (67, 1), (68, 1), (69, 1)],
 [(18, 1),
  (38, 1),
  (49, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1)],
 [(5, 1),
  (8, 1),
  (31, 1),
  (81, 1),
  (82, 1),
  (83,

In [26]:
corpora.MmCorpus.serialize('corpus.mm', corpus)