In [40]:
import spacy
import pandas as pd
from tqdm import tqdm as tq

In [41]:
# Base URL of the folder that hosts the project's CSV files
url_data = 'https://raw.githubusercontent.com/TeodorRusKvi/Tekstanalyse/main/git_NLP_data/'

# Read the raw posts file from that folder and preview the first rows
raw_csv_url = url_data + 'file_name.csv'
df = pd.read_csv(raw_csv_url)
df.head()

Unnamed: 0,Title,Political Lean,Score,Id,Subreddit,URL,Num of Comments,Text,Date Created
0,"No matter who someone is, how they look like, ...",Liberal,1,t5fybt,socialism,https://v.redd.it/ng5fyl7hp2l81,0,,1646272000.0
1,Biden speech draws 38.2 million U.S. TV viewers,Liberal,6,t5fqdn,democrats,https://www.reuters.com/world/us/biden-speech-...,1,,1646271000.0
2,State of the union,Liberal,1,t5fj9a,DemocraticSocialism,https://www.reddit.com/r/DemocraticSocialism/c...,1,Who watched the state of the union last night ...,1646270000.0
3,We Should Just Give Poor People Money,Liberal,7,t5f7n9,SocialDemocracy,https://youtu.be/a80kRjpubG0,3,,1646270000.0
4,Do it for the Dew,Liberal,6,t5es2c,democrats,https://i.redd.it/drmunn90f2l81.jpg,1,,1646268000.0


## Exploring the data

In [42]:
# Frequency table of the 'Subreddit' column
subreddit_counts = df['Subreddit'].value_counts()

# The table has one entry per distinct subreddit, so its length is the unique count
unique_subreddits = subreddit_counts.shape[0]

# Report the unique count, then display the whole per-subreddit distribution
print(f'Unique Subreddits: {unique_subreddits}')
subreddit_counts

Unique Subreddits: 15


Subreddit
conservatives          1000
SocialDemocracy         997
alltheleft              997
socialism               975
Libertarian             975
Capitalism              975
progressive             974
republicans             948
democrats               941
feminisms               935
DemocraticSocialism     922
Liberal                 904
anarchocapitalism       637
Communist               574
RadicalFeminism         100
Name: count, dtype: int64

In [43]:
# Class balance of the target label: number of posts per 'Political Lean' value
lean_column = df['Political Lean']
political_lean_counts = lean_column.value_counts()
political_lean_counts

Political Lean
Liberal         8319
Conservative    4535
Name: count, dtype: int64

In [44]:
# Flag posts that have no body text. The mask is taken BEFORE filling so the
# counts displayed below describe the raw data (True = 'Text' is missing).
text_column = df['Text'].isnull()

# Fill missing bodies with an empty string rather than 0: 'Text' is later cast
# to str and appended to 'Title', so a 0 would leak a literal "0" token into
# the combined text of every title-only post.
df['Text'] = df['Text'].fillna('')
text_column.value_counts()

Text
True     10426
False     2428
Name: count, dtype: int64

In [45]:
# Make every 'Text' entry a string so it can be joined to the title
text_as_str = df['Text'].astype(str)
df['Text'] = text_as_str

# Combined document per post: title, one space, then the body text
df['All_text'] = df['Title'] + ' ' + df['Text']

# Independent copy holding only the combined text and the two label columns
keep_columns = ['All_text', 'Political Lean', 'Subreddit']
new_df = df[keep_columns].copy()

In [48]:
# Replace the working frame with the saved 'new_df.csv' hosted in the same folder.
# NOTE(review): this rebinds `df`; the raw frame loaded earlier is no longer reachable under that name.
url_data = 'https://raw.githubusercontent.com/TeodorRusKvi/Tekstanalyse/main/git_NLP_data/'

new_df_url = url_data + 'new_df.csv'
df = pd.read_csv(new_df_url)

In [52]:
# Register tqdm's progress-aware pandas methods (used below as `progress_apply`)
tq.pandas(desc='Processing')

# Load spaCy's small English pipeline
model_name = "en_core_web_sm"
nlp = spacy.load(model_name)

# spaCy's English stop-word collection, used to filter lemmas during preprocessing
stop_words = spacy.lang.en.stop_words.STOP_WORDS

# Function to tokenize and lemmatize a single text, while removing stopwords and keeping certain abbreviations
def tokenize_and_lemmatize_text(text):
    """Return a space-joined string of cleaned tokens for one document.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is kept when it is one of the protected abbreviations ('U.S.', 'U.S.A.'),
    which are emitted verbatim, or when it is purely alphabetic and its
    lower-cased lemma is not in the module-level ``stop_words`` collection,
    in which case the lower-cased lemma is emitted.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (e.g. None or a NaN coming
        from a missing DataFrame cell) is treated as an empty document.

    Returns
    -------
    str
        The kept tokens joined by single spaces; '' when nothing is kept.
    """
    # A missing cell (NaN/None) would make spaCy raise and abort the whole
    # DataFrame apply; treat it as an empty document instead.
    if not isinstance(text, str):
        return ''

    # Abbreviations that must survive unchanged; defined once so the filter
    # and the output choice below cannot drift apart.
    kept_abbreviations = ('U.S.', 'U.S.A.')

    doc = nlp(text)  # Process the text with spaCy
    lemmatized_tokens = []
    for token in doc:
        if token.text in kept_abbreviations:
            # Protected abbreviation: keep the surface form as-is
            lemmatized_tokens.append(token.text)
            continue
        if not token.is_alpha:
            continue
        lemma = token.lemma_.lower()
        if lemma not in stop_words:
            lemmatized_tokens.append(lemma)
    return ' '.join(lemmatized_tokens)


# Apply the tokenization and lemmatization function to each row in the column
df['Processed'] = df['All_text'].progress_apply(tokenize_and_lemmatize_text)

# Show the first few rows to verify the changes
df

Processing: 100%|██████████| 12854/12854 [01:48<00:00, 118.22it/s]


Unnamed: 0,All_text,Political_Lean,Subreddit,subreddit_encoded,Processed,Dependency_Tags,POS_Tags,Named_Entities
0,"No matter who someone is, how they look like, ...",Liberal,socialism,14,matter look like language speak wear remember ...,"['advmod', 'ROOT', 'prep', 'compound', 'compou...","['ADV', 'VERB', 'ADP', 'NOUN', 'NOUN', 'NOUN',...",[]
1,Biden speech draws 38.2 million US TV viewers 0,Liberal,democrats,10,biden speech draw million tv viewer,"['compound', 'nsubj', 'ROOT', 'nummod', 'compo...","['PROPN', 'NOUN', 'VERB', 'NUM', 'PROPN', 'NOU...","['biden speech draw million (ORG)', 'U.S. (GPE)']"
2,State of the union Who watched the state of th...,Liberal,DemocraticSocialism,2,state union watch state union night opinion,"['compound', 'nsubj', 'ROOT', 'compound', 'com...","['NOUN', 'PROPN', 'VERB', 'NOUN', 'PROPN', 'NO...",['state union watch state union (ORG)']
3,We Should Just Give Poor People Money 0,Liberal,SocialDemocracy,6,poor people money,"['amod', 'compound', 'ROOT']","['ADJ', 'NOUN', 'NOUN']",[]
4,Do it for the Dew 0,Liberal,democrats,10,dew,['ROOT'],['NOUN'],[]
...,...,...,...,...,...,...,...,...
12849,Ron Paul’s Spirited Defense of WikiLeaks & Fre...,Conservative,anarchocapitalism,8,ron paul spirited defense wikileaks free infor...,"['compound', 'nsubj', 'amod', 'compound', 'nsu...","['PROPN', 'PROPN', 'VERB', 'NOUN', 'NOUN', 'AD...",['ron paul (PERSON)']
12850,"“Anarcho-capitalism, in my opinion, is a doctr...",Conservative,anarchocapitalism,8,anarcho capitalism opinion doctrinal system im...,"['nmod', 'compound', 'compound', 'amod', 'nsub...","['PROPN', 'PROPN', 'NOUN', 'ADJ', 'NOUN', 'VER...",[]
12851,Mises Wiki is a wiki project dedicated to the ...,Conservative,anarchocapitalism,8,mises wiki wiki project dedicate advancement a...,"['ROOT', 'compound', 'compound', 'nmod', 'amod...","['VERB', 'ADJ', 'NOUN', 'NOUN', 'NOUN', 'NOUN'...","['mises wiki wiki (PERSON)', 'austrian (NORP)'..."
12852,Fireman Protection Monopoly - Is This Failed C...,Conservative,anarchocapitalism,8,fireman protection monopoly failed capitalism,"['compound', 'compound', 'nsubj', 'ROOT', 'dobj']","['PROPN', 'PROPN', 'NOUN', 'VERB', 'NOUN']",['fireman protection monopoly (ORG)']


In [53]:
# Collapse the protected abbreviations 'U.S.' and 'U.S.A.' into the single token 'US'.
# The dots are escaped so they match literal periods rather than any character, and
# the optional '(?:A\.)?' handles both forms in one pass, so 'U.S.A.' can no longer
# be half-matched as 'U.S.' and left behind as 'USA.'.
df['Processed'] = df['Processed'].replace(r'U\.S\.(?:A\.)?', 'US', regex=True)

In [55]:
def extract_text_features(text):
    """Parse `text` with the module-level spaCy pipeline and collect three feature lists.

    Returns a 3-tuple ``(dependency_tags, pos_tags, named_entities)``:
    the per-token dependency labels, the per-token coarse part-of-speech tags,
    and one ``"<entity text> (<label>)"`` string per entity span found in the document.
    """
    doc = nlp(text)

    # Walk the tokens once, collecting both per-token annotations side by side
    dependency_tags = []
    pos_tags = []
    for token in doc:
        dependency_tags.append(token.dep_)
        pos_tags.append(token.pos_)

    # Each entity span (possibly several words) is rendered together with its label
    named_entities = []
    for ent in doc.ents:
        named_entities.append(f"{ent.text} ({ent.label_})")

    return dependency_tags, pos_tags, named_entities

# Apply the function to the 'Processed' column and create new columns for each feature
df['Dependency_Tags'], df['POS_Tags'], df['Named_Entities'] = zip(*df['Processed'].apply(extract_text_features))

# Show the first few rows to verify the new columns

KeyboardInterrupt: 

In [56]:
# Preview the first five rows to check the feature columns
df.head(n=5)

Unnamed: 0,All_text,Political_Lean,Subreddit,subreddit_encoded,Processed,Dependency_Tags,POS_Tags,Named_Entities
0,"No matter who someone is, how they look like, ...",Liberal,socialism,14,matter look like language speak wear remember ...,"[advmod, ROOT, prep, compound, compound, pobj,...","[ADV, VERB, ADP, NOUN, NOUN, NOUN, VERB, ADJ, ...",[]
1,Biden speech draws 38.2 million US TV viewers 0,Liberal,democrats,10,biden speech draw million tv viewer,"[compound, nsubj, ROOT, nummod, compound, dobj]","[PROPN, NOUN, VERB, NUM, NOUN, NOUN]",[biden speech draw million tv (ORG)]
2,State of the union Who watched the state of th...,Liberal,DemocraticSocialism,2,state union watch state union night opinion,"[compound, nsubj, ROOT, compound, compound, co...","[NOUN, PROPN, VERB, NOUN, PROPN, NOUN, NOUN]",[state union watch state union (ORG)]
3,We Should Just Give Poor People Money 0,Liberal,SocialDemocracy,6,poor people money,"[amod, compound, ROOT]","[ADJ, NOUN, NOUN]",[]
4,Do it for the Dew 0,Liberal,democrats,10,dew,[ROOT],[NOUN],[]
