# Trellis Law Case Study EDA

## Import Libraries and Modules

In [3]:
import sys
import os

# Resolve the parent of the current working directory once, expose it as
# `path`, and add it to the import search path so the `src.` package imports
# in this cell can be resolved.
path = os.path.abspath(os.path.join('..'))
sys.path.append(path)

import pandas as pd
import re
from collections import Counter
from wordcloud import WordCloud
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from src.trellis_law.nodes.data_engineering import preprocess_data

## Load Data

In [4]:

# Point at the raw dataset directory under the project root, then load and
# preprocess it into `data` (used by every analysis cell below).
raw_data_dir = path + '/data/01_raw/trellis_assesment_data'
data = preprocess_data(raw_data_dir)


Base path: /home/wilfredgp/trellis_law/data/01_raw/trellis_assesment_data
Reading directory: /home/wilfredgp/trellis_law/data/01_raw/trellis_assesment_data/food
Read 100 files from /home/wilfredgp/trellis_law/data/01_raw/trellis_assesment_data/food
food data: 100 records
Reading directory: /home/wilfredgp/trellis_law/data/01_raw/trellis_assesment_data/sport
Read 100 files from /home/wilfredgp/trellis_law/data/01_raw/trellis_assesment_data/sport
sport data: 100 records
Reading directory: /home/wilfredgp/trellis_law/data/01_raw/trellis_assesment_data/space
Read 100 files from /home/wilfredgp/trellis_law/data/01_raw/trellis_assesment_data/space
space data: 100 records
Reading directory: /home/wilfredgp/trellis_law/data/01_raw/trellis_assesment_data/medical
Read 100 files from /home/wilfredgp/trellis_law/data/01_raw/trellis_assesment_data/medical
medical data: 100 records
Reading directory: /home/wilfredgp/trellis_law/data/01_raw/trellis_assesment_data/business
Read 100 files from /home/wi

## Sentiment Analysis

### Sentiment Plot

In [22]:

# Sentiment Analysis
def analyze_sentiment(text):
    """Return the TextBlob sentiment polarity score for ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Score each document's text and store the polarity next to it
sentiment_scores = data['content'].apply(analyze_sentiment)
data['sentiment'] = sentiment_scores

# Plot sentiment distribution by category with Plotly.
# Bars are grouped side by side per category and shown as percentages
# rather than raw counts.
fig = px.histogram(
    data,
    x='sentiment',
    color='category',
    nbins=20,
    histnorm='percent',
    barmode='group',
    title='Sentiment Distribution by Category',
    labels={'sentiment': 'Sentiment Polarity'},
)
fig.update_layout(xaxis_title='Sentiment Polarity', yaxis_title='Percentage')
fig.show()


### Interpretation
**Overall observations:**

- Most categories show a similar sentiment distribution, with peaks at neutral (0) and mildly positive (0.2 to 0.3) polarity.
- Very few documents in any category have negative polarity.
- This suggests that the content is generally neutral to mildly positive regardless of category.

## Topic Modeling

### Unsupervised Topic Modeling (k=10)

In [12]:
# LDA Topic Modeling
# Build the document-term count matrix from the raw text. English stop words
# are removed; terms found in fewer than 2 documents (min_df) or in more than
# 90% of documents (max_df) are dropped from the vocabulary.
vectorizer = CountVectorizer(stop_words='english', min_df=2, max_df=0.9)
dtm = vectorizer.fit_transform(data['content'])

# Fit a 10-topic LDA model; the fixed random_state keeps the fit repeatable.
lda = LatentDirichletAllocation(n_components=10, random_state=42).fit(dtm)

# Extract topics and display them in a table
def get_topics(model, vectorizer, n_top_words):
    """Build a table of the highest-weighted keywords for each topic.

    Parameters
    ----------
    model
        Fitted topic model exposing ``components_``; each row is iterated as
        one topic's per-term weights.
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()``; its output is
        indexed by term position to look up the keyword strings.
    n_top_words : int
        Number of keywords to keep per topic.

    Returns
    -------
    pandas.DataFrame
        One column per topic, named ``'Topic <k> Keywords'`` with ``k``
        starting at 1. Rows list the keywords in descending weight order.
    """
    vocabulary = vectorizer.get_feature_names_out()

    def top_terms(weights):
        # argsort is ascending, so slice backwards from the end to take the
        # n_top_words largest weights, largest first.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        return [vocabulary[idx] for idx in ranked]

    return pd.DataFrame({
        f'Topic {number} Keywords': top_terms(weights)
        for number, weights in enumerate(model.components_, start=1)
    })

# Tabulate the five highest-weighted keywords of each fitted topic
topics_df = get_topics(model=lda, vectorizer=vectorizer, n_top_words=5)
display(topics_df)

Unnamed: 0,Topic 1 Keywords,Topic 2 Keywords,Topic 3 Keywords,Topic 4 Keywords,Topic 5 Keywords,Topic 6 Keywords,Topic 7 Keywords,Topic 8 Keywords,Topic 9 Keywords,Topic 10 Keywords
0,said,war,minutes,design,year,said,festival,space,party,health
1,mr,german,cup,graphic,said,people,film,nasa,said,hiv
2,government,french,data,best,world,mobile,new,edu,mr,medical
3,year,germany,add,edu,olympic,like,software,earth,election,disease
4,film,british,teaspoon,article,92,use,center,orbit,kilroy,1993


### Topic Distribution by Category

In [14]:
# Add a column for dominant topic in each document.
# Each row of `topic_values` holds one document's topic weights; argmax picks
# the column with the largest weight. The stored value is the 0-based column
# index, so it can be used directly to index `lda.components_`.
topic_values = lda.transform(dtm)
data['dominant_topic'] = topic_values.argmax(axis=1)

# Plot the distribution of topics by category.
# The plot uses a 1-based topic number so the x-axis lines up with the
# "Topic N Keywords" column names; the 0-based 'dominant_topic' column is left
# untouched. The bin count follows the number of topic columns instead of
# repeating the model's n_components as a literal.
n_topics = topic_values.shape[1]
fig = px.histogram(
    data.assign(topic_number=data['dominant_topic'] + 1),
    x='topic_number',
    color='category',
    nbins=n_topics,
    title='Topic Distribution by Category',
    labels={'topic_number': 'Dominant Topic'},
    barmode='group',
)
fig.update_layout(yaxis_title='Frequency', xaxis_title='Dominant Topic')
fig.update_xaxes(dtick=1)  # one tick per topic number
fig.show()
