In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found,
# one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if filenames:
        # One print per directory; the joined text is the same as printing
        # each path on its own line.
        print('\n'.join(os.path.join(dirname, filename) for filename in filenames))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from pandas import DataFrame
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import WordPunctTokenizer
from nltk.stem.isri import ISRIStemmer
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import itertools
import random
import os
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk
import sklearn
from collections import Counter

In [None]:
# Directory that holds the Hespress CSV files.
folder_path = '/kaggle/input/hespress'

# os.listdir() returns entries in arbitrary order; sort so every run visits
# the files in the same sequence.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Raw (un-preprocessed) frames keyed by CSV file name.
data_frames = {}

for csv_file in csv_files:
    csv_file_path = os.path.join(folder_path, csv_file)
    print(f"Contents of {csv_file}:")
    df = pd.read_csv(csv_file_path)
    # Drop the first two columns by position.
    # NOTE(review): presumably index/id columns — confirm against the CSV schema.
    df = df.iloc[:, 2:]
    # Previously this dict was created but never filled.
    data_frames[csv_file] = df
    display(df)
    print("\n")

In [None]:
def preprocessing_ar(text):
    """Run the full Arabic cleaning pipeline on one string.

    The steps are applied in this fixed order: stemming, stop-word removal,
    then character normalization. Returns the transformed string.
    """
    pipeline = (stemming_ar, stopWordRemove_ar, normlizeArabic_ar)
    for step in pipeline:
        text = step(text)
    return text

# Parsed stop-word sets keyed by file path. The function below is applied row
# by row, so the file is read once per path instead of once per call.
_STOPWORD_SETS = {}


def _load_stopwords_ar(path):
    """Return the whitespace-separated words stored in ``path`` as a frozenset (cached)."""
    cached = _STOPWORD_SETS.get(path)
    if cached is None:
        # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which
        # would otherwise stick to the first stop word.
        with open(path, encoding="utf-8-sig") as stop_file:
            # NOTE(review): assumes one stop word per whitespace-separated
            # token; a multi-word entry would be split into its words.
            cached = frozenset(stop_file.read().split())
        _STOPWORD_SETS[path] = cached
    return cached


def stopWordRemove_ar(text, stopwords_path="/kaggle/input/stopwords/allstop.txt"):
    """Remove stop words from ``text``.

    Parameters
    ----------
    text : str
        Input text; it is split into tokens with ``word_tokenize``.
    stopwords_path : str, optional
        Path of the stop-word file. Defaults to the Kaggle dataset location
        that was previously hard-coded.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.

    Notes
    -----
    Tokens are compared against a set of whole stop words. The previous
    implementation tested ``w not in <file contents string>``, a substring
    test that dropped any token merely contained in the file text. It also
    reopened the file on every call without closing it.
    """
    stop_words = _load_stopwords_ar(stopwords_path)
    kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
    return ' '.join(kept_tokens)

def stemming_ar(text):
    """Stem every token of ``text`` with an ``ISRIStemmer``.

    The text is split with ``word_tokenize``; each token is passed through
    the stemmer and the results are joined back with single spaces.
    """
    stemmer = ISRIStemmer()
    return ' '.join(stemmer.stem(token) for token in word_tokenize(text))

# Characters stripped from the text: digits, Arabic and Latin punctuation and
# the tatweel, plus everything in string.punctuation.
punctuations = '''`1234567890÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation

# Letter variants folded onto one canonical letter (variant -> canonical).
_AR_LETTER_FOLDS = {
    "إ": "ا",  # alef with hamza below -> bare alef
    "أ": "ا",  # alef with hamza above -> bare alef
    "آ": "ا",  # alef with madda -> bare alef
    "ى": "ي",  # alef maksura -> yeh
    "ؤ": "ء",  # waw with hamza -> hamza
    "ئ": "ء",  # yeh with hamza -> hamza
    "ة": "ه",  # teh marbuta -> heh
    "گ": "ك",  # gaf -> kaf
}

# Diacritics to delete, written as explicit code points because the marks are
# combining characters and effectively invisible when typed literally.
_AR_DIACRITICS = (
    "\u0651"  # Shadda
    "\u064E"  # Fatha
    "\u064B"  # Tanwin Fath
    "\u064F"  # Damma
    "\u064C"  # Tanwin Damm
    "\u0650"  # Kasra
    "\u064D"  # Tanwin Kasr
    "\u0652"  # Sukun
    "\u0640"  # Tatwil/Kashida
)

# One translation table built once at definition time: fold letter variants,
# delete punctuation/digits and diacritics. Deletions are merged in last so
# they win over any fold if the sets ever overlap.
# Note: changing `punctuations` later does not affect this table.
_AR_NORMALIZE_TABLE = str.maketrans(_AR_LETTER_FOLDS)
_AR_NORMALIZE_TABLE.update(str.maketrans('', '', punctuations + _AR_DIACRITICS))


def normlizeArabic_ar(text):
    """Normalize Arabic text in a single pass.

    Removes every character listed in ``punctuations`` (digits and
    punctuation), folds the alef/yeh/hamza/teh-marbuta/gaf variants onto a
    canonical letter, and strips the diacritics and the tatweel.

    Parameters
    ----------
    text : str
        Text to normalize.

    Returns
    -------
    str
        The normalized text. Whitespace is left untouched.

    Notes
    -----
    This gives the same result as the earlier sequence of a translate call
    followed by seven ``re.sub`` passes, without rebuilding the table and
    rescanning the text on every call. None of the replacement letters is
    itself folded or deleted, so applying all mappings at once is equivalent.
    """
    return text.translate(_AR_NORMALIZE_TABLE)

def prepareDataSet(df):
    """Preprocess the 'comment' and/or 'story' columns of ``df`` in place.

    For each of those columns that exists, the raw column is printed first.
    Non-string cells are then replaced by '' and every cell is passed
    through ``preprocessing_ar``. If neither column exists a notice is
    printed and the frame is left untouched. The same ``df`` object is
    returned in every case.
    """
    def _text_or_blank(value):
        return value if isinstance(value, str) else ''

    present = [name for name in ('comment', 'story') if name in df.columns]
    if not present:
        print("No 'comment' or 'story' column found in the DataFrame.")
        return df

    # Show every raw column before anything is modified; from the second
    # header on, a blank line separates it from the previous dump.
    for position, name in enumerate(present):
        separator = "\n" if position else ""
        print(f"{separator}Before preprocessing - '{name}' column:")
        print(df[name])

    # Blank out non-strings for all columns first, then run the pipeline.
    for name in present:
        df[name] = df[name].apply(_text_or_blank)
    for name in present:
        df[name] = df[name].apply(preprocessing_ar)
    return df

def remove_non_strings(df):
    """Replace non-string cells of the 'comment' and 'story' columns by ''.

    Other columns are not touched. The frame is modified in place and the
    same object is returned.
    """
    def _blank_unless_str(value):
        if isinstance(value, str):
            return value
        return ''

    targets = [label for label in df.columns if label in ['comment', 'story']]
    for label in targets:
        df[label] = df[label].apply(_blank_unless_str)
    return df

# Directory that holds the Hespress CSV files.
folder_path = '/kaggle/input/hespress'

# os.listdir() returns entries in arbitrary order. Sorting keeps the order of
# `preprocessed_dfs` stable between runs, so anything that refers to a frame
# by its position in the list stays reproducible.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))

# Cleaned frames, in the same order as `csv_files`.
preprocessed_dfs = []
for csv_file in csv_files:
    csv_file_path = os.path.join(folder_path, csv_file)
    print(f"Contents of {csv_file}:")
    df = pd.read_csv(csv_file_path)
    # Drop the first two columns by position and take an explicit copy, so the
    # in-place preprocessing below writes to an independent frame.
    # NOTE(review): presumably the dropped columns are index/id columns — confirm.
    df = df.iloc[:, 2:].copy()
    df = prepareDataSet(df)
    df_cleaned = remove_non_strings(df)
    preprocessed_dfs.append(df_cleaned)
    display(df_cleaned)
    print("\n")

In [None]:
def _word_ngram_counts(texts, n):
    """Count word-level n-grams over an iterable of documents, one document at a time."""
    counts = Counter()
    for text in texts:
        tokens = text.split() if isinstance(text, str) else []
        # Slide n staggered views of the token list against each other.
        counts.update(zip(*(tokens[i:] for i in range(n))))
    return counts


def calculate_and_visualize_eda_stories(df, story_col='story', topic_col='topic', author_col='author'):
    """Print and plot exploratory statistics for a stories DataFrame.

    Does nothing unless both ``story_col`` and ``topic_col`` are columns of
    ``df``. Otherwise it reports the number of examples per topic, the ten
    most frequent word bigrams overall and per topic, and the word and letter
    count distributions. If ``author_col`` exists it also reports the five
    most frequent authors.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. Two columns, 'word_count' and 'letter_count', are
        added to it in place.
    story_col, topic_col, author_col : str
        Names of the text, topic and author columns.

    Returns
    -------
    None

    Notes
    -----
    N-grams are counted over whitespace-separated words within each story.
    The earlier version zipped the characters of one big joined string, which
    produced character n-grams that also straddled story boundaries.
    """
    if not (story_col in df.columns and topic_col in df.columns):
        return

    print("EDA for Stories DataFrame:")

    # Number of examples per topic
    examples_per_topic = df[topic_col].value_counts()

    # Top frequent n-grams (n words per gram)
    n = 2
    top_ngrams_overall = _word_ngram_counts(df[story_col], n).most_common(10)

    for topic_name, topic_group in df.groupby(topic_col):
        class_top_ngrams = _word_ngram_counts(topic_group[story_col], n).most_common(10)

        # Print the top 10 most common n-grams per topic
        print(f"\nTop Frequent {n}-grams for {topic_name}:")
        for ngram, count in class_top_ngrams:
            print(f"{' '.join(ngram)} - Count: {count}")

    # Lengths of examples in words and letters (non-strings count as empty)
    df['word_count'] = df[story_col].apply(
        lambda x: len(x.split()) if isinstance(x, str) else 0)
    df['letter_count'] = df[story_col].apply(
        lambda x: len(x.replace(' ', '')) if isinstance(x, str) else 0)

    # Insights and Visualizations
    print("Number of Examples per Topic:")
    print(examples_per_topic)

    # This line lacked the f prefix and printed a literal "{n}".
    print(f"\nTop Frequent {n}-grams Overall:")
    for ngram, count in top_ngrams_overall:
        print(f"{' '.join(ngram)} - Count: {count}")

    # Bar plot: Number of examples per topic
    plt.figure(figsize=(8, 5))
    sns.barplot(x=examples_per_topic.index, y=examples_per_topic.values)
    plt.xlabel('Topic')
    plt.ylabel('Number of Examples')
    plt.title('Number of Examples per Topic (Stories)')
    plt.xticks(rotation=45)
    plt.show()

    # Word count distribution
    plt.figure(figsize=(8, 5))
    sns.histplot(df['word_count'], kde=True)
    plt.xlabel('Word Count')
    plt.ylabel('Frequency')
    plt.title('Distribution of Word Count (Stories)')
    plt.show()

    # Letter count distribution
    plt.figure(figsize=(8, 5))
    sns.histplot(df['letter_count'], kde=True)
    plt.xlabel('Letter Count')
    plt.ylabel('Frequency')
    plt.title('Distribution of Letter Count (Stories)')
    plt.show()

    # Author analysis: only possible when the author column is present.
    if author_col in df.columns:
        top_authors_stories = df[author_col].value_counts().nlargest(5)
        print("\nAuthor Analysis (Top 5 Authors - Stories):")
        print(top_authors_stories)
    else:
        print(f"\nNo '{author_col}' column found; skipping author analysis.")
# Run the stories EDA on every preprocessed frame that has both a 'story'
# and a 'topic' column.
for df in preprocessed_dfs:
    if not ('story' in df.columns and 'topic' in df.columns):
        continue
    calculate_and_visualize_eda_stories(df)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud

def calculate_and_visualize_eda_comments(df, comment_col='comment', topic_col='topic', score_col='score'):
    """Print exploratory statistics for a comments DataFrame.

    Does nothing unless ``comment_col``, ``topic_col`` and ``score_col`` are
    all columns of ``df``. Otherwise it prints the number of examples per
    topic and how many comments have a non-negative ("Positive") versus a
    negative ("Negative") score.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to analyse. Its ``score_col`` column is converted to numeric in
        place; values that cannot be parsed become NaN and are counted in
        neither group. If the frame carries a string ``name`` attribute it is
        used as the label in the header line.
    comment_col, topic_col, score_col : str
        Names of the comment, topic and score columns.

    Returns
    -------
    None
    """
    if not (comment_col in df.columns and topic_col in df.columns and score_col in df.columns):
        return

    # `name` is an ad-hoc attribute set by the caller. Fall back to a
    # placeholder when it is missing, or when it resolves to something other
    # than a string (e.g. a column that happens to be called "name").
    label = getattr(df, 'name', None)
    if not isinstance(label, str):
        label = 'unnamed'
    print(f"EDA for Comments DataFrame (CSV File: {label}):")

    # Number of examples per topic
    examples_per_topic_comments = df[topic_col].value_counts()
    print("Number of Examples per Topic:")
    print(examples_per_topic_comments)

    # Convert the score column to numeric. This honours `score_col`; the
    # column name was previously hard-coded as 'score'.
    df[score_col] = pd.to_numeric(df[score_col], errors='coerce')

    # Sentiment split: a score of 0 counts as positive.
    positive_comments = df[df[score_col] >= 0]
    negative_comments = df[df[score_col] < 0]
    sentiment_distribution_comments = pd.DataFrame({
        'Sentiment': ['Positive', 'Negative'],
        'Count': [len(positive_comments), len(negative_comments)]
    })
    print("\nSentiment Analysis (Positive and Negative Comments):")
    print(sentiment_distribution_comments)

        
# Run the comments EDA on every preprocessed frame that has a 'comment'
# column, labelling each frame by its position in the list.
for i, df in enumerate(preprocessed_dfs):
    if 'comment' not in df.columns:
        continue
    # The EDA function reads this ad-hoc attribute for its header line.
    df.name = f"DataFrame_{i}"
    calculate_and_visualize_eda_comments(df)