In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the HTTP, Zstandard and progress-bar packages that the import cell below relies on.
!pip install requests zstandard tqdm

Collecting zstandard
  Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: zstandard
Successfully installed zstandard-0.22.0


In [None]:
import requests
import zstandard as zstd
import pandas as pd
import os
from tqdm.auto import tqdm
import json

# Function to download a file
def download_file(url, filename, chunk_size=1 << 20, timeout=60):
  """Download ``url`` to ``filename`` unless ``filename`` already exists.

  The body is streamed to disk in chunks so that multi-gigabyte archives are
  never held in memory, and it is written to ``<filename>.part`` first so an
  interrupted or failed download cannot be mistaken for a finished one by the
  exists-check on the next run.

  Args:
    url: Address of the file to fetch.
    filename: Destination path; if it already exists nothing is downloaded.
    chunk_size: Number of bytes requested per chunk while streaming.
    timeout: Seconds to wait for the server before giving up.

  Returns:
    None.

  Raises:
    requests.HTTPError: if the server answers with an error status.
  """
  if os.path.exists(filename):
    return

  partial_name = f"{filename}.part"
  try:
    with requests.get(url, stream=True, timeout=timeout) as response:
      # Fail loudly instead of saving an HTML error page as the archive.
      response.raise_for_status()
      with open(partial_name, "wb") as f:
        for chunk in response.iter_content(chunk_size=chunk_size):
          if chunk:  # skip keep-alive chunks
            f.write(chunk)
    # Only a complete download is promoted to the final name.
    os.replace(partial_name, filename)
  finally:
    if os.path.exists(partial_name):
      os.remove(partial_name)

# Function to decompress a ZST File
def decompress_zst(input_file, output_file, max_window_size=2**31):
  """Decompress the Zstandard file ``input_file`` into ``output_file``.

  Nothing is done when ``output_file`` already exists. Output is written to
  ``<output_file>.part`` and renamed only after the whole stream has been
  decoded, so a failed run does not leave a truncated file that the
  exists-check would later treat as complete.

  Args:
    input_file: Path of the ``.zst`` archive.
    output_file: Path of the decompressed file to create.
    max_window_size: Largest decoder window to accept. The default is raised
      because archives compressed with a long window are rejected by the
      library default. NOTE(review): assumes these dumps use such a window -
      confirm against one real archive.

  Returns:
    None.
  """
  if os.path.exists(output_file):
    return

  decompressor = zstd.ZstdDecompressor(max_window_size=max_window_size)
  partial_name = f"{output_file}.part"
  try:
    with open(input_file, "rb") as ifh, open(partial_name, "wb") as ofh:
      decompressor.copy_stream(ifh, ofh)
    os.replace(partial_name, output_file)
  finally:
    if os.path.exists(partial_name):
      os.remove(partial_name)

# Builds the per-subreddit CSV of post bodies from a decompressed JSONL dump.
def make_dataset(input_file, output_file, keep_condition=None, max_posts=50000):
  """Extract post bodies from a JSONL file into a one-column CSV.

  Args:
    input_file: Path of a UTF-8 JSONL file with one post object per line.
    output_file: Path of the CSV to write. If it already exists the function
      returns immediately without reading ``input_file``.
    keep_condition: Callable taking a post (dict) and returning True when the
      post should be kept. ``None`` keeps every post.
    max_posts: Stop reading once this many posts have been kept.

  Returns:
    A DataFrame with a single ``document`` column (duplicates and nulls
    removed), or ``None`` when ``output_file`` already existed. When no usable
    post is found an empty frame is returned and no CSV is written, so that a
    broken download is not cached as a valid, empty result.
  """
  if os.path.exists(output_file):
    return

  if keep_condition is None:
    keep_condition = lambda post: True

  texts = []
  with open(input_file, "r", encoding='utf8') as file:
    for line in tqdm(file):
      if not line.strip():  # tolerate blank lines in the dump
        continue
      post = json.loads(line)
      if keep_condition(post):
        # Only the body is needed downstream; keeping whole posts wastes memory.
        texts.append(post.get('selftext'))
        if len(texts) >= max_posts:
          break

  df = pd.DataFrame({'document': texts})
  df = df.drop_duplicates().dropna().reset_index(drop=True)
  if df.empty:
    return df
  df.to_csv(output_file, index=False)
  return df

In [None]:
"""
The following URLs were collected for train/testing of our models.
Shia, Politics, and exmuslim were included for the Annotated set.
"""
# "https://the-eye.eu/redarcs/files/islam_submissions.zst",
# "https://the-eye.eu/redarcs/files/technology_submissions.zst",
# "https://the-eye.eu/redarcs/files/science_submissions.zst",
# "https://the-eye.eu/redarcs/files/food_submissions.zst",
# "https://the-eye.eu/redarcs/files/gaming_submissions.zst",
# "https://the-eye.eu/redarcs/files/travel_submissions.zst",
# "https://the-eye.eu/redarcs/files/personalfinance_submissions.zst",
# "https://the-eye.eu/redarcs/files/sports_submissions.zst",
# "https://the-eye.eu/redarcs/files/television_submissions.zst",
# "https://the-eye.eu/redarcs/files/Fitness_submissions.zst",
# "https://the-eye.eu/redarcs/files/AskReddit_submissions.zst",
# "https://the-eye.eu/redarcs/files/atheism_submissions.zst",
# "https://the-eye.eu/redarcs/files/Christianity_submissions.zst",
# "https://the-eye.eu/redarcs/files/hinduism_submissions.zst",
# "https://the-eye.eu/redarcs/files/Muslim_submissions.zst",
# "https://the-eye.eu/redarcs/files/Hijabis_submissions.zst",
# "https://the-eye.eu/redarcs/files/MuslimMarriage_submissions.zst",
# "https://the-eye.eu/redarcs/files/exmuslim_submissions.zst",
#  "https://the-eye.eu/redarcs/files/Pets_submissions.zst",
# "https://the-eye.eu/redarcs/files/shia_submissions.zst",
# "https://the-eye.eu/redarcs/files/politics_submissions.zst"

# Archives fetched by this notebook, one Zstandard dump per subreddit.
archive_urls = [
    "https://the-eye.eu/redarcs/files/islam_submissions.zst",
    "https://the-eye.eu/redarcs/files/technology_submissions.zst",
    "https://the-eye.eu/redarcs/files/science_submissions.zst",
    "https://the-eye.eu/redarcs/files/food_submissions.zst",
    "https://the-eye.eu/redarcs/files/gaming_submissions.zst",
    "https://the-eye.eu/redarcs/files/travel_submissions.zst",
    "https://the-eye.eu/redarcs/files/personalfinance_submissions.zst",
    "https://the-eye.eu/redarcs/files/sports_submissions.zst",
    "https://the-eye.eu/redarcs/files/television_submissions.zst",
    "https://the-eye.eu/redarcs/files/Fitness_submissions.zst",
    "https://the-eye.eu/redarcs/files/AskReddit_submissions.zst",
    "https://the-eye.eu/redarcs/files/atheism_submissions.zst",
    "https://the-eye.eu/redarcs/files/Christianity_submissions.zst",
    "https://the-eye.eu/redarcs/files/hinduism_submissions.zst",
    "https://the-eye.eu/redarcs/files/Muslim_submissions.zst",
    "https://the-eye.eu/redarcs/files/Hijabis_submissions.zst",
    "https://the-eye.eu/redarcs/files/MuslimMarriage_submissions.zst",
    "https://the-eye.eu/redarcs/files/Pets_submissions.zst",
]

# The local file name of each archive is the last path segment of its URL.
archive_names = [link.rsplit("/", 1)[-1] for link in archive_urls]

archive_names

['islam_submissions.zst',
 'technology_submissions.zst',
 'science_submissions.zst',
 'food_submissions.zst',
 'gaming_submissions.zst',
 'travel_submissions.zst',
 'personalfinance_submissions.zst',
 'sports_submissions.zst',
 'television_submissions.zst',
 'Fitness_submissions.zst',
 'AskReddit_submissions.zst',
 'atheism_submissions.zst',
 'Christianity_submissions.zst',
 'hinduism_submissions.zst',
 'Muslim_submissions.zst',
 'Hijabis_submissions.zst',
 'MuslimMarriage_submissions.zst',
 'exmuslim_submissions.zst',
 'Pets_submissions.zst',
 'politics_submissions.zst']

In [None]:
import os

text_threshold = 500


def keep_post(post):
    """Return True for posts whose body is a string longer than ``text_threshold`` characters."""
    body = post.get('selftext')
    return isinstance(body, str) and len(body) > text_threshold


# Suffix shared by every archive name; it is removed explicitly because
# str.rstrip(".zst") strips a *set of characters*, not a suffix.
ARCHIVE_SUFFIX = "_submissions.zst"

# Each step skips work whose output already exists, so only the subreddits that
# are missing from the working directory are downloaded/decompressed/created.
for archive_url in archive_urls:
    # Derive the file name from the URL itself so a stale or differently sized
    # `archive_names` list can never pair a URL with the wrong file.
    archive_name = archive_url.split("/")[-1]
    decompressed_name = os.path.splitext(archive_name)[0] + '.txt'

    if archive_name.endswith(ARCHIVE_SUFFIX):
        subreddit_name = archive_name[:-len(ARCHIVE_SUFFIX)]
    else:
        subreddit_name = os.path.splitext(archive_name)[0]
    # Later cells read "<subreddit>_submission.csv" (singular), so keep that name.
    results_name = f"{subreddit_name}_submission.csv"

    if not os.path.exists(results_name):
        download_file(archive_url, archive_name)
        decompress_zst(archive_name, decompressed_name)
        df = make_dataset(decompressed_name, results_name, keep_post)

        # To prevent disk-space from running out, delete the intermediates.
        for leftover in (decompressed_name, archive_name):
            if os.path.exists(leftover):
                os.remove(leftover)

0it [00:00, ?it/s]

## Generating a Random Sample of Documents

Code for randomly sampling an equal number of documents from each .csv file in a given directory

In [None]:
import os
import pandas as pd
import random

def sample_documents(directory, csv_names, total=50000, seed=42):
    """Draw an equal-sized random sample of documents from each CSV and shuffle them.

    Args:
        directory: Folder that contains the CSV files.
        csv_names: File names (relative to ``directory``) to sample from; each
            file must have a ``document`` column.
        total: Target number of rows overall; every file contributes
            ``total // len(csv_names)`` rows, or all of its rows if it has fewer.
        seed: Random state used for the per-file samples and the final shuffle.

    Returns:
        A DataFrame with a single ``document`` column and a fresh 0..n-1 index.
    """
    if not csv_names:
        return pd.DataFrame(columns=['document'])

    per_file = total // len(csv_names)
    parts = []
    for csv_name in csv_names:
        frame = pd.read_csv(os.path.join(directory, csv_name))
        # Keep a one-column frame (not a Series) so the column stays named "document".
        parts.append(
            frame[['document']].sample(n=min(per_file, len(frame)), random_state=seed)
        )

    combined = pd.concat(parts, ignore_index=True)
    return combined.sample(frac=1, random_state=seed).reset_index(drop=True)


# Set the directory where your CSV files are located
directory_path = '/content'
sample_output_name = 'random_sample.csv'

# Sorted for a reproducible order; the output file is excluded so a rerun does
# not sample from its own previous result.
csv_files = (
    sorted(
        f for f in os.listdir(directory_path)
        if f.endswith('.csv') and f != sample_output_name
    )
    if os.path.isdir(directory_path)
    else []
)

combined_df = pd.DataFrame()

if not csv_files:
    print("No CSV files found in the directory.")
else:
    combined_df = sample_documents(directory_path, csv_files, total=50000, seed=42)

    # Save the random sample to a new CSV file and report the real row count,
    # which is below the target whenever some files are smaller than their share.
    combined_df.to_csv(sample_output_name, index=False)
    print(f"Random sample of {len(combined_df):,} rows extracted and saved as '{sample_output_name}'.")


Random sample of 50,000 rows extracted and saved as 'random_sample.csv'.


In [None]:
# Load the saved sample for inspection.
# NOTE(review): the sample was written with a relative path, so this assumes the working directory is /content.
pd.read_csv("/content/random_sample.csv")

Unnamed: 0,0
0,Does anyone know a way to reduce lag when play...
1,"I have friends from all over the world, and it..."
2,I'll be 30 years old when I go back to school ...
3,"I am using a netbook, I picked it up to move i..."
4,"I live in a town called Peace, girls were foun..."
...,...
48089,It seemed like a couple of years ago manufactu...
48090,Hello all.\n\nMe and my wife want to take a we...
48091,So I do some work for imachickenrancher.com - ...
48092,It has been an eventful half-season for Power ...


## Generating a Zip

In [None]:
import zipfile

# Folder whose CSV files are bundled, and the name of the archive to produce.
folder_path = "/content"
zip_file_name = "Reddit-All-20-raw.zip"

csv_files = [entry for entry in os.listdir(folder_path) if entry.endswith('.csv')]

# Store every CSV at the top level of a deflate-compressed zip.
with zipfile.ZipFile(zip_file_name, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
    for csv_name in csv_files:
        source_path = os.path.join(folder_path, csv_name)
        archive.write(source_path, arcname=os.path.basename(source_path))

## Cleaning and replacing DataFrames that have not yet had duplicates and nulls removed

In [None]:
import pandas as pd

# Load the raw r/islam export and report its size before and after cleaning.
df = pd.read_csv("/content/drive/MyDrive/FYP/Data/Reddit-Islam-raw.csv")
rows_before = len(df)
print("Number of rows before:", rows_before)

# Remove exact duplicate rows, then rows with missing values, then renumber.
df = df.drop_duplicates()
df = df.dropna()
df = df.reset_index(drop=True)
rows_after = len(df)
print("Number of rows after dropping duplicates and nulls:", rows_after)

Number of rows before: 39998
Number of rows after dropping duplicates and nulls: 39998


In [None]:
# Write the cleaned frame back to the same path it was read from, replacing the original file (index not saved).
df.to_csv("/content/drive/MyDrive/FYP/Data/Reddit-Islam-raw.csv", index=False)

# Generating Test/Train Data for Non-Annotated Dataset

In [None]:
# Download the zipped per-subreddit CSVs from Google Drive by file id, then unpack them into the working directory.
!gdown 1tHCIhmav_aTyWL89L3t5pd_7YVXhqWIQ
!unzip Reddit-All-20-raw.zip

Downloading...
From: https://drive.google.com/uc?id=1tHCIhmav_aTyWL89L3t5pd_7YVXhqWIQ
To: /content/Reddit-All-20-raw.zip
100% 489M/489M [00:05<00:00, 93.5MB/s]
Archive:  Reddit-All-20-raw.zip
  inflating: food_submission.csv     
  inflating: travel_submission.csv   
  inflating: exmuslim_submission.csv  
  inflating: Fitness_submission.csv  
  inflating: science_submission.csv  
  inflating: atheism_submission.csv  
  inflating: AskReddit_submission.csv  
  inflating: Hijabis_submission.csv  
  inflating: technology_submission.csv  
  inflating: gaming_submission.csv   
  inflating: Pets_submission.csv     
  inflating: MuslimMarriage_submission.csv  
  inflating: television_submission.csv  
  inflating: hinduism_submission.csv  
  inflating: Muslim_submission.csv   
  inflating: personalfinance_submission.csv  
  inflating: sports_submission.csv   
  inflating: politics_submission.csv  
  inflating: Christianity_submission.csv  
  inflating: islam_submission.csv    


In [None]:
# OG, used for current training: training rows to draw per non-Islamic subreddit.
_train_quota_by_subreddit = {
    "food": 3000,
    "AskReddit": 5000,
    "Christianity": 6000,
    "atheism": 4000,
    "hinduism": 3000,
    "Pets": 4000,
    "gaming": 3000,
    "science": 2000,
    "sports": 4000,
    "television": 3000,
    "personalfinance": 3000,
    "technology": 3000,
    "travel": 4000,
    "Fitness": 3000,
}

# Later cells index each entry as [subreddit, quota], so expose it as a list of pairs.
files = [[subreddit, quota] for subreddit, quota in _train_quota_by_subreddit.items()]

In [None]:
# This is only for Non-Islamic Subreddits
# Total number of training rows requested across all subreddits. A separate
# name is used so that the builtin `sum` is not shadowed for later cells.
total_train_docs = sum(quota for _, quota in files)
print(total_train_docs)

train_parts = []
test_parts = []

for subreddit, train_quota in files:
    csv_path = f"/content/{subreddit}_submission.csv"
    # The travel CSV is read with an explicit "\n" line terminator.
    if subreddit == "travel":
        df = pd.read_csv(csv_path, lineterminator='\n')
    else:
        df = pd.read_csv(csv_path)
    df['subreddit'] = subreddit

    # Training rows are a seeded random sample; test rows are the first
    # 10,000 of whatever is left, so the two sets never overlap.
    df2 = df.sample(n=train_quota, random_state=42)
    df = df.drop(df2.index)

    train_parts.append(df2)
    test_parts.append(df.iloc[:10000])

# Concatenate once; ignore_index gives both frames a fresh 0..n-1 index, which
# is what the discarded reset_index calls were reaching for.
train_df = pd.concat(train_parts, ignore_index=True)
train_df["class"] = 0

test_df = pd.concat(test_parts, ignore_index=True)
test_df["class"] = 0

50000


In [None]:
# r/islam documents, tagged with their subreddit and the positive class label.
islamic_df = pd.read_csv("/content/islam_submission.csv").assign(
    subreddit="islam",
    **{"class": 1},
)
islamic_df

Unnamed: 0,document,subreddit,class
0,Salam islam subreddit.\n\nI guess I've been a ...,islam,1
1,First a foreword: Every bit of the following p...,islam,1
2,"Dear islam sub-redditors,\nI am a ""cultural hi...",islam,1
3,I was watching a debate recently where it was ...,islam,1
4,I judge a religion based not on it's pretty wo...,islam,1
...,...,...,...
39995,I made a similar post earlier in r/askmuslims ...,islam,1
39996,"Salam, the title is pretty self explanatory, b...",islam,1
39997,This is more of an emotional rather than a rat...,islam,1
39998,"salaam,\n\nmy father died last year and my mom...",islam,1


In [None]:
 # This is the final training DF: the r/islam rows (class 1) followed by the non-Islamic training sample (class 0).
 # Original note: 90k docs, 50k non-Islamic and 40k Islamic; 12k of the non-Islamic docs are religious.
 # NOTE(review): the quotas for Christianity, atheism and hinduism add up to 13k - confirm which figure is intended.
final_train_df = pd.concat([islamic_df, train_df], ignore_index=True)
final_train_df

Unnamed: 0,document,subreddit,class
0,Salam islam subreddit.\n\nI guess I've been a ...,islam,1
1,First a foreword: Every bit of the following p...,islam,1
2,"Dear islam sub-redditors,\nI am a ""cultural hi...",islam,1
3,I was watching a debate recently where it was ...,islam,1
4,I judge a religion based not on it's pretty wo...,islam,1
...,...,...,...
89995,I'm not trying to be on alpha-male overload by...,Fitness,0
89996,Hey all. So I am currently a artist for the en...,Fitness,0
89997,I've been slowly recovering from a lower back ...,Fitness,0
89998,Just as the title described. I've been lifting...,Fitness,0


In [None]:
# Build the final test frame: the non-Islamic test rows plus the two Islamic
# subreddits other than r/islam, which are labelled as class 1.
hijabis_docs = pd.read_csv("/content/Hijabis_submission.csv").assign(subreddit="Hijabis")
muslim_docs = pd.read_csv("/content/Muslim_submission.csv").assign(subreddit="Muslim")

islam_test_df = pd.concat([hijabis_docs, muslim_docs], ignore_index=True).assign(**{"class": 1})

final_test_df = pd.concat([test_df, islam_test_df], ignore_index=True)
final_test_df

Unnamed: 0,document,subreddit,class
0,I was just talking to an elderly Italian woman...,food,0
1,A few weeks back someone posted a chocolate ca...,food,0
2,* 1 16 oz package spaghetti noodles or angel h...,food,0
3,"I had some friends over this weekend, and we d...",food,0
4,* Serves six.\n* Prep Time—30 minutes (five wi...,food,0
...,...,...,...
109876,Abu Hurairah (RA) reported Allah's Messenger (...,Muslim,1
109877,My mother celebrates pagan holidays (solstices...,Muslim,1
109878,"ASAK, I'm (18) brought to the limit. All I'm g...",Muslim,1
109879,I came here because I wanted to read the Quran...,Muslim,1


In [None]:
# Doing some shuffling: sample(frac=1) returns every row in random order, and random_state=42 makes that order repeatable.
# The index is not reset here, so each row keeps the label it had before the shuffle.
final_train_df = final_train_df.sample(frac=1, random_state=42)
final_train_df

Unnamed: 0,document,subreddit,class
0,"Hi, just wanted to start an info thread. I've ...",Fitness,0
1,It is only 10 questions and is completely anon...,islam,1
2,Like a second-last chance? Before the 'gate' c...,Christianity,0
3,Hey guys!\n\nI just picked up my new buddy fro...,Pets,0
4,Ill start off saying that before this it had b...,gaming,0
...,...,...,...
89995,"Salaam alaikum! \nSo, I'm a recent convert fro...",islam,1
89996,I see a lot of posts around here bashing reli...,atheism,0
89997,"Compared to say, UK television shows of multip...",television,0
89998,"asalamu `alaykum wa rahmatulLahi wa barakatuh,...",islam,1


Export your CSVs

In [None]:
# Save the shuffled training frame to the working directory without its index column.
final_train_df.to_csv("train-raw.csv", index=False)

In [None]:
# Save the test frame to the working directory without its index column.
final_test_df.to_csv("test-raw.csv", index=False)

# Combining all CSVs

In [None]:
folder_path = "/content"

# Only the per-subreddit exports are combined. Other CSVs that earlier cells
# write to the working directory (train-raw.csv, test-raw.csv, ...) carry extra
# columns, and the dropna() below would then discard every row that lacks them.
SUBMISSION_SUFFIX = "_submission.csv"
submission_csvs = [f for f in os.listdir(folder_path) if f.endswith(SUBMISSION_SUFFIX)]

frames = []
for csv_name in submission_csvs:
    print(csv_name)
    # Read from folder_path rather than relying on the current working directory.
    csv_path = os.path.join(folder_path, csv_name)
    if csv_name == "travel_submission.csv":
        temp_df = pd.read_csv(csv_path, lineterminator='\n')
    else:
        temp_df = pd.read_csv(csv_path)

    # Record which subreddit each document came from; the per-subreddit files
    # only store the text, and the sampling for annotation filters on this column.
    temp_df["subreddit"] = csv_name[:-len(SUBMISSION_SUFFIX)]
    frames.append(temp_df)

# Concatenate once instead of growing the frame inside the loop.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=["document", "subreddit"])

df=df.drop_duplicates().dropna().reset_index(drop=True)
df

television_submission.csv
hinduism_submission.csv
personalfinance_submission.csv
sports_submission.csv
travel_submission.csv
MuslimMarriage_submission.csv
science_submission.csv
Pets_submission.csv
exmuslim_submission.csv
atheism_submission.csv
Fitness_submission.csv
Muslim_submission.csv
Hijabis_submission.csv
food_submission.csv
shia_submission.csv
AskReddit_submission.csv
gaming_submission.csv
Christianity_submission.csv
technology_submission.csv
politics_submission.csv
islam_submission.csv


Unnamed: 0,document,subreddit
0,If the last five years of television should ha...,television
1,The title says it all. I'm a bit miffed but a...,television
2,Robert Englund played the lovable I-can't-beli...,television
3,Hope it's OK to ask here...\n\nI changed from ...,television
4,you get 3 hrs/day during the week and 1 hr on ...,television
...,...,...
589563,I made a similar post earlier in r/askmuslims ...,islam
589564,"Salam, the title is pretty self explanatory, b...",islam
589565,This is more of an emotional rather than a rat...,islam
589566,"salaam,\n\nmy father died last year and my mom...",islam


In [None]:
# Save the combined, de-duplicated documents to the working directory without the index column.
df.to_csv("documents-all-raw.csv", index=False)

# Code for making the final distribution for Annotation

In [None]:
import pandas as pd

# Load the document pool for annotation from Drive; only "\n" is treated as a row separator while parsing.
df = pd.read_csv("/content/drive/MyDrive/FYP/Data/Annotation Data/Quality-Docs.csv", lineterminator="\n")
df

Unnamed: 0,Document,subreddit
0,If the last five years of television should ha...,television
1,The title says it all. I'm a bit miffed but a...,television
2,Robert Englund played the lovable I-can't-beli...,television
3,Hope it's OK to ask here...\n\nI changed from ...,television
4,you get 3 hrs/day during the week and 1 hr on ...,television
...,...,...
534543,I made a similar post earlier in r/askmuslims ...,islam
534544,"Salam, the title is pretty self explanatory, b...",islam
534545,This is more of an emotional rather than a rat...,islam
534546,"salaam,\n\nmy father died last year and my mom...",islam


In [None]:
# Drop r/travel rows whose text contains a carriage return.
# regex=False matches the literal character, and na=False makes a missing
# document count as "no match" instead of producing NaN in the mask, which
# would break the boolean `&` and `~` below.
carriage_mask = df['Document'].str.contains('\r', regex=False, na=False)
travel_mask = df['subreddit'] == 'travel'

mask_to_remove = carriage_mask & travel_mask

df = df[~mask_to_remove]

In [None]:
# Display the frame again after the carriage-return filter.
df

Unnamed: 0,Document,subreddit
0,If the last five years of television should ha...,television
1,The title says it all. I'm a bit miffed but a...,television
2,Robert Englund played the lovable I-can't-beli...,television
3,Hope it's OK to ask here...\n\nI changed from ...,television
4,you get 3 hrs/day during the week and 1 hr on ...,television
...,...,...
534543,I made a similar post earlier in r/askmuslims ...,islam
534544,"Salam, the title is pretty self explanatory, b...",islam
534545,This is more of an emotional rather than a rat...,islam
534546,"salaam,\n\nmy father died last year and my mom...",islam


In [None]:
# total = 35000 documents to annotate, split per subreddit as follows.
_annotation_quota_by_subreddit = {
    "islam": 11000,
    "Muslim": 1000,
    "exmuslim": 2000,
    "Hijabis": 500,
    "MuslimMarriage": 500,
    "food": 700,
    "AskReddit": 600,
    "Pets": 700,
    "gaming": 700,
    "science": 600,
    "sports": 700,
    "television": 600,
    "personalfinance": 700,
    "technology": 600,
    "travel": 700,
    "Fitness": 700,
    "politics": 700,
    "Christianity": 4000,
    "atheism": 4000,
    "hinduism": 4000,
}

# The sampling cell unpacks each entry as (subreddit, sample_size).
files = [[subreddit, quota] for subreddit, quota in _annotation_quota_by_subreddit.items()]

In [None]:
# Draw the requested number of documents from every subreddit. Row labels from
# `df` are kept, so each sampled document can be traced back to its source row.
sampled_parts = []

for subreddit, sample_size in files:
    # Filter the DataFrame for the current subreddit
    subreddit_df = df[df['subreddit'] == subreddit]

    print(subreddit, len(subreddit_df), sample_size)
    if sample_size > len(subreddit_df):
        # DataFrame.sample would raise the same exception type with a message
        # that does not say which subreddit is short; name it explicitly.
        raise ValueError(
            f"r/{subreddit} has only {len(subreddit_df)} documents, "
            f"cannot sample {sample_size}"
        )

    # Sample 'sample_size' number of rows from the filtered DataFrame
    sampled_parts.append(subreddit_df.sample(n=sample_size, random_state=42))

# Concatenate once instead of repeatedly appending to an initially empty frame.
output = pd.concat(sampled_parts) if sampled_parts else pd.DataFrame()

output

islam 34647 11000
Muslim 1823 1000
exmuslim 38661 2000
Hijabis 3694 500
MuslimMarriage 7345 500
food 7272 700
AskReddit 46909 600
Pets 43737 700
gaming 45917 700
science 1887 600
sports 4321 700
television 21138 600
personalfinance 46886 700
technology 4951 600
travel 47821 700
Fitness 47320 700
politics 33824 700
Christianity 43350 4000
atheism 43766 4000
hinduism 6554 4000


Unnamed: 0,Document,subreddit
527941,"I was raised as a Muslim, but never got into t...",islam
534235,"\nAs salamu alaikum, I wonder what the benefit...",islam
531104,I was at a coffee shop when I saw a carbonara ...,islam
525922,"Hello, so I am a 13 year old boy, naturally I ...",islam
523640,We intend to get married soon but my father an...,islam
...,...,...
24397,"You never hear of the Lord Shiva, for example,...",hinduism
23344,**Question 1 : What is the difference between ...,hinduism
23076,"The symbol right now is associated with hate, ...",hinduism
25367,\n\nNamaste dear all \n \n\nI am really rea...,hinduism


In [None]:
# Save the annotation sample to Drive; the row labels are written as a column named "index".
output.to_csv("/content/drive/MyDrive/FYP/Data/Annotation Data/To-be-Annotated-35000.csv", index_label="index")