In [3]:
import pandas as pd

# INDICATE ACTIVITY NUMBER
# Interpolated into the act{ACT_NUM} / activity-{ACT_NUM} data paths used by the cells below.
ACT_NUM = 5

## Filter CL207 transcripts only

In [24]:


# Load the full transcript export for this activity and reduce it to the
# lines spoken by CL207, in a single chain:
#   1. keep only the rows whose speaker is 'CL207'
#   2. drop the now-constant 'speaker' column
#   3. expose the spoken text under the name 'Transcript'
df = (
    pd.read_csv(f'act{ACT_NUM}/dataset_all.csv')
    .loc[lambda frame: frame.speaker == 'CL207']
    .drop(columns=['speaker'])
    .rename(columns={'text': 'Transcript'})
)

df.head(10)

Unnamed: 0,Transcript
0,Put the needle on the record
6,"Uh, doing very well"
7,"Yeah, I think I'm just generally pretty happy,..."
8,"So yeah, always, always positive"
11,So I got in 2018
12,"Actually, it was kind of a because I was horny"
13,There's this good this girl in my college was ...
14,"I was like, no, no fucking clue what the fuck ..."
15,"But you know, her, her thighs were crazy"
16,I just followed her to the blockchain club


## Clean sentences and combine

In [25]:
# Remove every double-quote character from the transcript text.
# regex=False states explicitly that '"' is a literal string, so the result
# does not depend on the pandas version's default for `regex`.
df["Transcript"] = df["Transcript"].str.replace('"', '', regex=False)
df.head(10)

Unnamed: 0,Transcript
0,Put the needle on the record
6,"Uh, doing very well"
7,"Yeah, I think I'm just generally pretty happy,..."
8,"So yeah, always, always positive"
11,So I got in 2018
12,"Actually, it was kind of a because I was horny"
13,There's this good this girl in my college was ...
14,"I was like, no, no fucking clue what the fuck ..."
15,"But you know, her, her thighs were crazy"
16,I just followed her to the blockchain club


In [26]:
# Save the modified file
# df.to_csv(f"../scraped-data/act{ACT_NUM}/dataset_all.csv", index=False)

## Manual Clean

## Adding Keywords

In [7]:
# Read the manually cleaned transcript (decoded as ISO-8859-1) and give every
# row an empty "Keywords" placeholder column.
cleaned_df = (
    pd.read_csv(
        f'../scraped-data/act{ACT_NUM}/man-clean-file.csv',
        encoding='ISO-8859-1',
    )
    .assign(Keywords=None)
)

cleaned_df.head(10)

Unnamed: 0,Transcript,Keywords
0,"CL207 got in in 2018, CL207 started crypto bec...",
1,A girl from blockchain club told CL207 XRP was...,
2,CL207 bought like two grand of Bitcoin. Then i...,
3,CL207 need to start using a ton of leverage. H...,
4,"So when CL207 was one, my mom took CL207 to a ...",
5,He needs a cute nickname or evil will try to g...,
6,So she started calling CL207 cat in Chinese.,
7,"Overtime, COVID happened, and CL207 had to put...",
8,CL207 chose yellow for the hazmat suit. It is ...,
9,CL207 think not near 99% of my gains is like B...,


In [11]:
# Category name -> keywords that mark a transcript line as belonging to it.
# tag_keywords() lower-cases both sides before comparing, so the casing used
# here is purely cosmetic.
keywords_dict = {
    'Crypto': [
        'btc', 'bitcoin', 'eth', 'ethereum', 'altcoin',
        # 'crpyto' is kept next to the correct spelling so that lines
        # containing the misspelling are still tagged.
        'token', 'meme coins', 'crypto', 'crpyto', 'fork', 'binance',
        'leverage', 'Governance Token', 'volatile', 'sol', 'solana',
        'ripple', 'XRP', 'doge'
        ],
    'DeFi': [
       'defi', 'uniswap', 'UNI', 'unisock', 'SOX', 'aave', 'dex', 'slippage',
       'swap', 'yield', 'yield farming', 'staking', 'liquidity mining',
       'lend', 'collateral', 'loan', 'E-girl', 'AVI', 'SNX'
        ],
    'CL207': [
        # 'Cat' and 'twitter' are separate keywords; without the comma Python
        # concatenates the adjacent literals into the single string 'Cattwitter'.
        'CL207', 'CL', 'CL20', 'Anonymous', 'Cat', 'twitter',
        'gaming', 'read', 'books', 'hazmat', 'writer'
        ],
    'General': [
        'Investment', 'Market', 'Technology', 'Finance', 'Innovation',
        'Digital Currency', 'Economics', 'Trade', 'Tokenization',
        'Regulation', 'Security', 'Privacy', 'Scalability', 'Governance',
        'Protocol', 'Network', 'Decentralization',
        'trading', 'liquidation', 'liquidated', 'strategy',
        'opportunities', 'market', 'stonks', 'vulnerabilities', 'stocks', 'bond',
        'net worth'
    ],
    'Politics': [
        'politics', 'Trump'
    ],
    'Education': [
        'education', 'school'
    ],
    'Health': [
        'broccoli', 'berry', 'sugar', 'health'
    ],
}


def tag_keywords(text):
    """Return the categories whose keywords occur in ``text``.

    Matching is a case-insensitive *substring* test against every keyword in
    ``keywords_dict``. Because it is a substring test, short keywords also
    match inside longer words (for example 'eth' matches "something").

    Parameters
    ----------
    text : str
        One transcript line. Any non-string value (e.g. the NaN that pandas
        uses for an empty cell) is treated as having no keywords.

    Returns
    -------
    str
        The matching category names joined by ', ' in ``keywords_dict``
        order, or '' when nothing matches.
    """
    # Empty CSV cells arrive as float NaN, which has no .lower().
    if not isinstance(text, str):
        return ''

    # Lower-case the text once instead of once per keyword.
    lowered = text.lower()
    tags = [
        category
        for category, keywords in keywords_dict.items()
        if any(keyword.lower() in lowered for keyword in keywords)
    ]
    return ', '.join(tags)

In [13]:
# Tag each transcript row with the categories it mentions.
keyword_tags = cleaned_df['Transcript'].apply(tag_keywords)
cleaned_df['Keywords'] = keyword_tags

# Persist the tagged transcript without the DataFrame index.
keywords_csv_path = f"../cleaned-data/activity-{ACT_NUM}/cl207_content_with_keywords_act{ACT_NUM}.csv"
cleaned_df.to_csv(keywords_csv_path, index=False)

print('Cleaned file saved!')

Cleaned file saved!


In [15]:
# Tab-separated text copy of the tagged transcript: same location and base
# name as the keywords CSV, with a .txt extension.
txt_file_path = f"../cleaned-data/activity-{ACT_NUM}/cl207_content_with_keywords_act{ACT_NUM}.txt"

# The encoding is set explicitly so the file's bytes (and whether non-ASCII
# characters can be written at all) do not depend on the platform default.
# newline='\n' keeps line endings as "\n" on every platform.
with open(txt_file_path, 'w', newline='\n', encoding='utf-8') as txt_file:
    # Write the header
    txt_file.write("Content\tKeywords\n")
    for _, row in cleaned_df.iterrows():
        # The fallback strings are only used when the column itself is missing.
        content = row.get('Transcript', 'No Content Found')
        keywords = row.get('Keywords', 'No Keywords Found')
        txt_file.write(f"{content}\t{keywords}\n")