In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# `plt` must be the pyplot submodule: the cells below call plt.figure / plt.hist /
# plt.show, none of which exist on the top-level matplotlib package.
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the processed (seqlen128) training CSV from the Kaggle input directory into `df`,
# the frame the analysis cells below operate on.
df = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train-processed-seqlen128.csv")


In [None]:
# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Tally null cells per column, then report whether any column has gaps
missing_data = df.isnull().sum()
print("Missing Data:\n", missing_data)

has_missing = missing_data.sum() != 0
print(
    "There is missing data in the dataset."
    if has_missing
    else "No missing data in the dataset."
)

In [None]:
# pyplot is imported here so the cell can draw its own figure
import matplotlib.pyplot as plt

# Character count of every comment, stored as a new column
df['message_length'] = df['comment_text'].str.len()

# Histogram of comment lengths over the whole training frame, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(df['message_length'], bins=60, color='green', alpha=0.7, label='All Comments')
ax.set_xlabel('Message Length')
ax.set_ylabel('Frequency')
ax.set_title('Message Length for Training Data')
ax.legend()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Character count per comment
df['message_length'] = df['comment_text'].str.len()

# A comment is "dirty" when its six label columns sum to more than zero
df['is_dirty'] = df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) > 0

# Select with the boolean mask itself (and its negation) in a single .loc lookup,
# rather than comparing the mask to True/False and chaining two indexers.
clean_comments = df.loc[~df['is_dirty'], 'message_length']
dirty_comments = df.loc[df['is_dirty'], 'message_length']

# Overlaid, semi-transparent histograms of message length for the two groups
plt.figure(figsize=(12, 6))
plt.hist(clean_comments, bins=100, color='blue', alpha=0.5, label='Clean Comments')
plt.hist(dirty_comments, bins=100, color='red', alpha=0.5, label='Dirty Comments')
plt.xlabel('Message Length')
plt.ylabel('Frequency')
plt.title('Message Length Distribution: Clean vs Dirty Comments')
plt.legend()
# NOTE(review): ticks are placed at 0, 500, 1000 and 1500 only; this does not change the
# axis limits, so any bars beyond 1500 sit on an unlabelled stretch — confirm whether an
# x-limit was intended instead.
plt.xticks(ticks=range(0, 2000, 500))
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Label columns whose entries are summed below
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Column sum per label, plus the number of rows whose labels sum to zero ("clean")
tag_counts = {label: df[label].sum() for label in label_columns}
tag_counts['clean'] = (df[label_columns].sum(axis=1) == 0).sum()

# Convert the dictionary into a DataFrame for easy plotting
tag_counts_df = pd.DataFrame(list(tag_counts.items()), columns=['Type', 'Occurrences'])

# One bar per label (and one for "clean"), each in its own colour
plt.figure(figsize=(10, 6))
plt.bar(tag_counts_df['Type'], tag_counts_df['Occurrences'], color=['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink'])
plt.xlabel('Type')
plt.ylabel('Occurrences')
plt.title('Number of Tags')
# Counts are whole numbers, so print them without a decimal part; the label is placed
# a fixed 1000 units above the top of each bar.
for i, val in enumerate(tag_counts_df['Occurrences']):
    plt.text(i, val + 1000, f'{int(val)}', ha='center', fontsize=10)
plt.show()


In [None]:
# Per-comment total across the six label columns
tag_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
df['num_tags'] = df[tag_columns].sum(axis=1)

# How many comments have each distinct total, ordered by the total itself
tag_counts = df['num_tags'].value_counts().sort_index()

# Bar chart on an explicit Axes, one tab20 colour per bar
fig, ax = plt.subplots(figsize=(10, 6))
bar_colors = plt.cm.tab20(range(len(tag_counts)))
tag_counts.plot(kind='bar', color=bar_colors, ax=ax)
ax.set_title('Number of Multiple Tags per Comment')
ax.set_xlabel('Number of Tags')
ax.set_ylabel('Occurrences')

# Write each bar's count just above it
for position, count in enumerate(tag_counts):
    ax.text(position, count + 500, str(count), ha='center', va='bottom', fontsize=10)

plt.show()


In [None]:
# Function to calculate the percentage of unique words in a comment
def percent_unique_words(text):
    """Return the share of distinct whitespace-separated tokens in ``text``, as a percentage.

    Tokens are compared exactly as written; case and punctuation are not normalised.

    Args:
        text: The comment to analyse. Anything that is not a ``str`` (for example the
            float NaN pandas uses for a missing value) is treated as having no words.

    Returns:
        A float in the range 0.0-100.0; 0.0 when there are no words to count.
    """
    # Guard against missing values so Series.apply does not fail on NaN/None,
    # mirroring the isinstance check in remove_ip_addresses.
    if not isinstance(text, str):
        return 0.0
    words = text.split()  # split on any run of whitespace
    if not words:
        return 0.0
    return len(set(words)) / len(words) * 100

# Add a new column for the percentage of unique words
df["percent_unique_words"] = df["comment_text"].apply(percent_unique_words)

# Split the percentages by the is_dirty flag
dirty_comments = df[df['is_dirty'] == 1]["percent_unique_words"]
clean_comments = df[df['is_dirty'] == 0]["percent_unique_words"]

# Plot one kernel density estimate per group
plt.figure(figsize=(10, 6))
sns.kdeplot(dirty_comments, fill=True, color="red", label="Dirty")
sns.kdeplot(clean_comments, fill=True, color="blue", label="Clean")

# Add titles and labels. A KDE curve shows an estimated probability density, not a count
# of comments, so the y-axis is labelled accordingly.
plt.title("Percentage of Unique Words of Total Words in Comments", fontsize=14)
plt.xlabel("Percent Unique Words", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.legend()
plt.show()


In [None]:
# Display the computed unique-word percentage column.
df["percent_unique_words"]

In [None]:
# Preview the frame again, now including the columns added by the cells above.
df.head()

In [None]:
import re
def remove_ip_addresses(text):
    """Delete dotted-quad (IPv4-shaped) tokens from ``text``.

    A token is four groups of one to three digits separated by dots and bounded by
    word boundaries; the digit groups are not range-checked. Matches are removed
    without collapsing the surrounding whitespace.

    Args:
        text: Value to clean. Non-string values are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a ``str``.
    """
    if not isinstance(text, str):
        return text
    ipv4_like = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
    return re.sub(ipv4_like, '', text)

# Strip IPv4-shaped tokens from every comment, overwriting the column in place.
# NOTE(review): message_length and percent_unique_words were computed in earlier cells
# from the unstripped text — confirm that is intended.
df['comment_text'] = df['comment_text'].apply(remove_ip_addresses)

In [None]:
# Install langid; in this notebook it is only referenced by the commented-out
# language-detection cell that follows.
# NOTE(review): consider `%pip install` with a pinned version so the install targets the
# running kernel's environment and is reproducible.
!pip install langid

In [None]:
# import langid


# # Function to detect language
# def detect_language(text):
#     try:
#         return langid.classify(text)[0]
#     except Exception:
#         return 'unknown'

# # Apply language detection to the comment_text column
# df['detected_lang'] = df['comment_text'].apply(detect_language)

# # Display the dataset with detected languages
# print(df.head())

In [None]:
# Read validation.csv from the same Kaggle dataset into its own frame.
df_validation = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/validation.csv")

In [None]:
# Preview the first rows of the validation frame.
df_validation.head()

In [None]:
# Read jigsaw-toxic-comment-train.csv (the file without the "processed-seqlen128" suffix)
# into its own frame.
df_train_unprocessed = pd.read_csv(
    "/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv"
)

In [None]:
# Preview the first rows of the unprocessed training frame.
df_train_unprocessed.head()