# Necessary libraries

In [None]:
# Install the third-party packages into the kernel's environment.
# NOTE(review): versions are unpinned, and matplotlib/seaborn (imported by the
# plotting cell) are not installed here — presumably preinstalled; confirm.
%pip install nepali-to-roman
%pip install langdetect
%pip install pandas

# Input and Output paths

In [None]:
# Where the scraped comments are read from and where the filtered result is
# written. Both files share one name, so it is spelled out a single time.
_dataset_filename = "dataset_facebook-comments-scraper_2024-05-03_16-07-42-569.csv"
to_be_tested_dataset_input_path = f"../Facebook Datas/{_dataset_filename}"
filtered_dataset_output_path = f"../Profanity and Gender filtered Datas/{_dataset_filename}"

# Keyword Gathering


### Manual Tagging Profanity Keywords

In [None]:
import pandas as pd
# Load the manually tagged keyword list; later cells read its "Profanity",
# "RawNep", "RawRom", "NormNep" and "NormRom" columns.
manual_tagging_df = pd.read_csv("./filterwordlist.csv")
# Bare last expression so the notebook renders the frame
manual_tagging_df

In [None]:
# Count how many rows carry each distinct value of the "Profanity" column
manual_tagging_df["Profanity"].value_counts()

In [None]:
# Keep the 'Profanity' column as strings, as the rest of the notebook sees it
manual_tagging_df['Profanity'] = manual_tagging_df['Profanity'].astype(str)

# Select the rows flagged as profanity. The flag is compared numerically:
# when the CSV column is read as float (e.g. because some cells are empty),
# astype(str) yields '1.0', which a plain == '1' comparison silently misses.
is_profane = pd.to_numeric(manual_tagging_df["Profanity"], errors="coerce") == 1

# Keep only the flagged rows and renumber them from 0
manual_profanity_df = manual_tagging_df[is_profane].reset_index(drop=True)

# Display the flagged rows
manual_profanity_df

In [None]:
# Collect the profanity keywords from every spelling column of the manual list
profnaity_word_set = set()

for keyword_column in ("RawNep", "RawRom", "NormNep", "NormRom"):
    # Empty CSV cells are read as NaN (a float). Letting them into the set makes
    # the later '|'.join(...) over the keywords fail, so they are dropped here.
    column_words = manual_profanity_df[keyword_column].dropna().astype(str).str.strip()
    # A blank keyword would match every comment, so it is discarded as well
    profnaity_word_set.update(column_words[column_words != ""])

# Display the unique keywords gathered so far
profnaity_word_set

### NepSA Profanity Keywords

In [None]:
# Load NepSA.csv (read without a header row), give columns 1-3 the names used
# below, and discard column 0.
NepSA_df = (
    pd.read_csv("./NepSA.csv", header=None)
    .rename(columns={1: "Type", 2: "Keyword", 3: "Data"})
    .drop(columns=[0])
)

# Display the relabelled frame
NepSA_df

In [None]:
# Count how many rows carry each distinct value of the "Type" column
NepSA_df["Type"].value_counts()

In [None]:
# Keep only the rows whose Type is "PROFANITY"
is_profanity_row = NepSA_df["Type"] == "PROFANITY"
NepSA_profanity_df = NepSA_df[is_profanity_row]

# Unique profanity keywords from those rows. Missing cells (NaN) and blank
# strings are left out: a NaN breaks the later '|'.join(...) over the keywords
# and a blank keyword would match every comment.
nepsa_keywords = NepSA_profanity_df["Keyword"].dropna().astype(str).str.strip()
NepSaprofane = set(nepsa_keywords[nepsa_keywords != ""])
NepSaprofane

In [None]:
# Merge the NepSA keywords into the combined keyword set
profnaity_word_set.update(NepSaprofane)

# Freeze the keywords into a list. The list is sorted because set iteration
# order for strings can differ between kernel sessions, and profanityfilter()
# reports the first keyword in list order that matches a comment. Non-string
# and blank entries are skipped since they cannot be joined into a pattern.
profnaity_word_list = sorted(
    word for word in profnaity_word_set if isinstance(word, str) and word.strip()
)
profnaity_word_set

In [None]:
# Number of keywords in the combined list (manual tagging + NepSA)
len(profnaity_word_list)

# Profanity filtering

In [None]:

import pandas as pd

# Sentence filtering using the keywords gathered above
def profanityfilter(df, column_name, filterwords):
    """Select the rows of `df` whose `column_name` text contains a filter word.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to scan.
    column_name : str
        Name of the text column in `df`.
    filterwords : iterable
        Keywords to look for. Entries that are not non-blank strings
        (e.g. NaN read from a CSV) are ignored.

    Returns
    -------
    (selected_df, results_df)
        selected_df is an independent copy of the matching rows of `df`.
        results_df has one row per selected row, with the text under
        `column_name` and the first keyword (in `filterwords` order) found
        in it under 'Filterword'.

    Notes
    -----
    Rows are selected with a case-sensitive literal match, while the reported
    'Filterword' is found case-insensitively, as in the original implementation.
    """
    import re  # local import: the cell defining this function only imports pandas

    # Drop NaN / non-string entries (they make '|'.join fail) and blank
    # keywords (an empty alternative matches every row).
    keywords = [word for word in filterwords if isinstance(word, str) and word.strip()]
    result_columns = [column_name, 'Filterword']

    if not keywords:
        # Nothing to search for: return empty frames with the usual columns
        return df.iloc[0:0].copy(), pd.DataFrame(columns=result_columns)

    # Escape each keyword so characters such as '.', '*' or '(' are matched
    # literally, consistent with the substring check used below.
    pattern = '|'.join(re.escape(word) for word in keywords)

    # na=False treats missing text as "no match" instead of producing NaN in
    # the mask, which boolean indexing rejects.
    mask = df[column_name].str.contains(pattern, case=True, regex=True, na=False)

    # Copy so callers can add columns without a SettingWithCopyWarning
    selected_df = df[mask].copy()

    # Lower-case every keyword once instead of once per row
    lowered_keywords = [(word, word.lower()) for word in keywords]

    results = []
    for text in selected_df[column_name]:
        text_lower = text.lower()
        for word, word_lower in lowered_keywords:
            if word_lower in text_lower:
                results.append({column_name: text, 'Filterword': word})
                break  # only the first matching keyword is reported

    # Explicit columns keep the frame well-formed even when nothing matched
    results_df = pd.DataFrame(results, columns=result_columns)

    return selected_df, results_df


#### Importing to-be-tested Dataset

In [None]:
# Read the scraped comments from the configured input path; later cells use
# the "text" and "profileName" columns of this data.
to_be_tested_dataset=pd.read_csv(to_be_tested_dataset_input_path)
to_be_tested_dataset.head()

#### Applying the filtering function

In [None]:
# Keep only the comments whose "text" contains one of the gathered keywords;
# result_df pairs each kept comment with one matching keyword.
filtered_df,result_df= profanityfilter(to_be_tested_dataset,"text",profnaity_word_list)
# Summarise the columns and row count of the kept comments
filtered_df.info()

# Gender Detection

#### Importing files

In [None]:
import pandas as pd

# Load the boys' and girls' name lists and the voter-derived name lists (all CSV files)
boys_names_df = pd.read_csv("./Boys Names Nepali + Indian - Names.csv")
girls_names_df = pd.read_csv("./Girl Name Nepali + Indian - Sheet1.csv")
voter_girls_names_df = pd.read_csv("voter_dataset_female.csv")
voter_boys_names_df = pd.read_csv("voter_dataset_male.csv")

#### Labeling, removing duplicate, null values for boys and girls

In [None]:
# Keep only the "Name" column and drop rows without a name
boys_names_df = boys_names_df[["Name"]].dropna()

# A single cell may list several names separated by ';', '/' or ','.
# Both substitutions must go through the .str accessor: Series.replace('/', ',')
# only swaps cells that are exactly '/', so '/'-separated names were never split.
boy_names = (
    boys_names_df["Name"]
    .str.replace(";", ",", regex=False)
    .str.replace("/", ",", regex=False)
    .str.split(",")
    .explode()    # one row per individual name
    .str.strip()  # remove the spaces left around each separator
)

# Discard the empty pieces produced by trailing or doubled separators
boy_names = boy_names[boy_names != ""]

# Assign a gender label (1 for boys)
boys_names_df = boy_names.to_frame(name="Name").assign(Gender=1)
boys_names_df.head()

In [None]:
# For girls' names: keep only the "Name" column and drop rows without a name
girls_names_df = girls_names_df[["Name"]].dropna()

# A single cell may list several names separated by ';', '/' or ','.
# Both substitutions must go through the .str accessor: Series.replace('/', ',')
# only swaps cells that are exactly '/', so '/'-separated names were never split.
girl_names = (
    girls_names_df["Name"]
    .str.replace(";", ",", regex=False)
    .str.replace("/", ",", regex=False)
    .str.split(",")
    .explode()    # one row per individual name
    .str.strip()  # remove the spaces left around each separator
)

# Discard the empty pieces produced by trailing or doubled separators
girl_names = girl_names[girl_names != ""]

# Assign a gender label (0 for girls)
girls_names_df = girl_names.to_frame(name="Name").assign(Gender=0)
girls_names_df.head()

In [None]:
# Expose the voter list's "First_Name" column under the shared "Name" header
# and keep just the two columns the combined table needs.
# NOTE(review): assumes this file's "Gender" values use the same 1/0 coding
# assigned to the other name lists — confirm.
voter_boys_names_df = voter_boys_names_df.rename(columns={'First_Name': 'Name'})[["Name", "Gender"]]
voter_boys_names_df

In [None]:
# Expose the voter list's "First_Name" column under the shared "Name" header
# and keep just the two columns the combined table needs.
# NOTE(review): assumes this file's "Gender" values use the same 1/0 coding
# assigned to the other name lists — confirm.
voter_girls_names_df = voter_girls_names_df.rename(columns={'First_Name': 'Name'})[["Name", "Gender"]]
voter_girls_names_df

#### Combining all the names in single dataframe

In [None]:
# Stack the four name lists into one frame and drop rows without a name
combined_names_df = pd.concat(
    [boys_names_df, girls_names_df, voter_boys_names_df, voter_girls_names_df],
    ignore_index=True,
).dropna(subset=['Name'])

# get_gender_from_name() looks names up in str.capitalize() form, so the table
# is stored in that form too; otherwise entries such as "RAM" or " Ram" could
# never be matched.
combined_names_df = combined_names_df.assign(
    Name=combined_names_df['Name'].astype(str).str.strip().str.capitalize()
)

# Drop names that are empty after trimming
combined_names_df = combined_names_df[combined_names_df['Name'] != '']

# Keep one row per name. keep='first' means a name present in several lists
# takes the label of the earliest list in the concat order above.
combined_names_df = combined_names_df.drop_duplicates(subset=['Name'], keep='first')

# Reset index
combined_names_df = combined_names_df.reset_index(drop=True)

# Display the combined DataFrame
combined_names_df

#### Conversion of Nepali name into English

In [None]:
# %pip install nepali-to-roman
# %pip install langdetect
from langdetect import detect
import re
import ntr

def capitalize_after_space(name):
    """Capitalize each whitespace-separated word of `name`.

    The words are re-joined with single spaces, so runs of whitespace collapse.
    """
    return ' '.join(part.capitalize() for part in name.split())

def detect_nep_and_coversion(text):
    """Romanize the parts of `text` that are detected as Nepali.

    `text` is converted with str() when it is not already a string, then split
    after '.', '!' or '?' when followed by whitespace and a non-digit. Each
    piece detected as "ne" is passed through ntr.nep_to_rom and has its words
    capitalized; other pieces are kept unchanged. The pieces are joined with
    '.' and returned.

    A piece for which detection or romanization raises is kept unchanged
    rather than dropped, so a name is never silently turned into an empty
    string.
    """
    # Ensure that text is a string or convert it to a string if it's not
    if not isinstance(text, str):
        text = str(text)

    sentences = re.split(r'(?<=[.!?])\s+(?=\D)', text)
    english_comments = []
    for sentence in sentences:
        try:
            if detect(sentence) == "ne":
                sentence = capitalize_after_space(ntr.nep_to_rom(sentence))
        except Exception:
            # Best effort: detection can fail on input it cannot classify
            # (presumably digits- or symbol-only text); keep the piece as is.
            pass
        english_comments.append(sentence)

    return '.'.join(english_comments)

In [None]:
# Add the romanized profile name as "English_Name". assign() builds a new frame
# instead of writing into filtered_df, which is a row selection of the loaded
# dataset and would otherwise trigger pandas' SettingWithCopyWarning.
filtered_df = filtered_df.assign(
    English_Name=filtered_df["profileName"].apply(detect_nep_and_coversion)
)
filtered_df.head()

#### Detection of Gender using Name

In [None]:
# Function to get gender based on name input
def get_gender_from_name(name, names_df=None):
    """Look up the gender label recorded for the first word of `name`.

    Parameters
    ----------
    name : str
        Full or first name. Only the first whitespace-separated word is used,
        in str.capitalize() form.
    names_df : pandas.DataFrame, optional
        Lookup table with "Name" and "Gender" columns. Defaults to the
        notebook-level `combined_names_df`.

    Returns
    -------
    The "Gender" value of the first matching row, or 'Unknown' when `name` is
    not a string, is blank, or has no match.
    """
    # Missing values (None/NaN) and other non-strings cannot be looked up
    if not isinstance(name, str):
        return 'Unknown'

    # split() with no arguments yields [] for blank input, which previously
    # raised IndexError on a whitespace-only name
    words = name.split()
    if not words:
        return 'Unknown'

    if names_df is None:
        names_df = combined_names_df

    first_name = words[0].capitalize()
    matches = names_df.loc[names_df['Name'] == first_name, 'Gender'].values
    if len(matches) > 0:
        return matches[0]
    return 'Unknown'

In [None]:
# Store the detected label under "NameDetection", the column every later cell
# reads. The original wrote it to "Gender" and then selected the non-existent
# "English Coversion"/"NameDetection" columns, which raises KeyError.
filtered_df = filtered_df.assign(
    NameDetection=filtered_df["English_Name"].apply(get_gender_from_name)
)
filtered_df[["English_Name", "NameDetection"]].head()

#### Analysis of Gender Data

In [None]:
# Tally how many filtered comments carry each "NameDetection" label
filtered_df["NameDetection"].value_counts()

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

# Horizontal bar chart of the number of comments per "NameDetection" label
label_counts = filtered_df.groupby('NameDetection').size()
ax = label_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))

# Hide the top and right borders of the plot
ax.spines[['top', 'right']].set_visible(False)

In [None]:
# Rows whose name lookup found no gender, reduced to the profile name and label
is_unknown = filtered_df["NameDetection"] == "Unknown"
unknown_names_df = filtered_df.loc[is_unknown, ["profileName", "NameDetection"]]

# Print how many such rows there are
print(unknown_names_df["NameDetection"].value_counts())

# Display the profiles that could not be labelled
unknown_names_df

In [None]:
# Render the filtered frame for a final look before it is written to disk
filtered_df

# Saving the final Output

In [None]:
import os

# Create the output folder if needed. A path with no directory part gives
# dirname '' and os.makedirs('') raises, so that case is skipped.
output_dir = os.path.dirname(filtered_dataset_output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the filtered comments (the DataFrame index is written as the first column)
filtered_df.to_csv(filtered_dataset_output_path)