## Porządkowanie danych + EDA

## Abstrakt projektu

## Library import

In [2]:
import os
from datetime import datetime
from pathlib import Path

import langid
import pandas as pd
from sklearn.preprocessing import LabelEncoder

## Data

In [3]:
# Directory holding the per-client CSV exports. It defaults to the original author's
# local path; set the DS_PROJECT_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = Path(
    os.environ.get(
        "DS_PROJECT_DATA_DIR",
        "/Users/tomaszkozubal/Documents/Nauka/Data Science - podyplomówka/DS_project_data/processed_data",
    )
)


def load_client(name: str) -> pd.DataFrame:
    """Read the processed export ``df_<name>.csv`` for one client from ``DATA_DIR``.

    Args:
        name: Client identifier used in the file name (e.g. ``"pmi"``).

    Returns:
        The CSV contents as a DataFrame, parsed with pandas' default settings.

    Raises:
        FileNotFoundError: If the expected file is not present in ``DATA_DIR``.
    """
    return pd.read_csv(DATA_DIR / f"df_{name}.csv")


still = load_client("still")
balluf = load_client("balluf")
orange = load_client("orange")
capgemini = load_client("capgemini")
pmi = load_client("pmi")
bosch = load_client("bosch")

In [4]:
# Client name -> raw DataFrame, in the order the clients are processed below.
# NOTE(review): `capgemini` is read from disk in the loading cell but is not part of
# this mapping — confirm that leaving it out is intentional.
dfs = dict(
    still=still,
    balluf=balluf,
    orange=orange,
    pmi=pmi,
    bosch=bosch,
)

In [5]:
# Report (rows, columns) of every raw client frame.
for name, df in dfs.items():
    print(name, df.shape, sep=": ")

still: (1329, 17)
balluf: (4892, 17)
orange: (5655, 17)
pmi: (4456, 17)
bosch: (9798, 17)


## Cleaning dataframes

In [8]:
class DataProcessor:
    """Reduce one client's raw post export to a clean, English-only DataFrame.

    The pipeline (see ``process_all``) keeps a fixed set of columns, drops rows
    without an estimated coverage value, fills missing language codes with
    ``langid``, keeps only English rows, converts ``published_at`` to Unix epoch
    seconds and resets the index.

    The DataFrame passed to the constructor is never modified: every step
    rebinds ``self.df`` to a new frame.
    """

    # Columns retained by clean(). "estimated_covearage" is spelled exactly as the
    # column is named in the input data.
    REQUIRED_COLS = ["comment", "published_at", "details_type", "likes", "comments",
                     "estimated_covearage", "user_network_at_stats_check_date", "language"]

    # Language codes that survive lang_processing().
    TARGET_LANGUAGES = ["en"]

    def __init__(self, df: pd.DataFrame):
        """Store the frame to process.

        Args:
            df: Raw client export; must contain every column in ``REQUIRED_COLS``.
        """
        self.df = df
        # Not used by any method of this class; kept for backward compatibility.
        self.df_lang = None
        self.df_pl = None
        self.df_eng = None

    def clean(self):
        """Keep only ``REQUIRED_COLS`` and drop rows with no ``estimated_covearage``.

        Raises:
            KeyError: If a required column is missing from the frame.
        """
        # .copy() gives later steps a standalone frame, so their column assignments
        # cannot trigger SettingWithCopyWarning or write through to the caller's data.
        self.df = (
            self.df[self.REQUIRED_COLS]
            .dropna(subset=["estimated_covearage"])
            .copy()
        )

    def lang_processing(self):
        """Fill missing language codes with ``langid`` and keep only target languages.

        Only rows whose ``language`` is NaN are classified; rows that already carry
        a code are left untouched. Rows with neither a language nor a comment cannot
        be classified, so they keep NaN and are removed by the final filter.
        """
        language = self.df["language"]
        undetected = language.isna() & self.df["comment"].notna()

        if undetected.any():
            detected = [
                langid.classify(text)[0]
                for text in self.df.loc[undetected, "comment"]
            ]
            # Cast to object so string codes can be stored even when the column
            # was parsed as all-NaN floats; astype returns a new Series.
            language = language.astype(object)
            language.loc[undetected] = detected
            self.df = self.df.assign(language=language)

        self.df = self.df[self.df["language"].isin(self.TARGET_LANGUAGES)].copy()

    def time_change(self):
        """Convert ``published_at`` to whole seconds since the Unix epoch (UTC).

        Timestamps without a timezone are interpreted as UTC. The result is computed
        by dividing the offset from the epoch by one second, so it does not depend on
        the internal resolution pandas chooses for the parsed datetimes.
        """
        published = pd.to_datetime(self.df["published_at"], utc=True)
        epoch = pd.Timestamp("1970-01-01", tz="UTC")
        self.df = self.df.assign(
            published_at=(published - epoch) // pd.Timedelta(seconds=1)
        )

    def reset(self):
        """Replace the index with a fresh 0..n-1 range, discarding the old one."""
        self.df = self.df.reset_index(drop=True)

    def process_all(self):
        """Run clean → lang_processing → time_change → reset.

        Returns:
            The processed DataFrame (also available as ``self.df``).
        """
        self.clean()
        self.lang_processing()
        self.time_change()
        self.reset()

        return self.df

In [9]:
# Run the full DataProcessor pipeline on every client frame and collect the
# processed frames under the same client names.
processed_dfs = {}

for name, df in dfs.items():
    processor = DataProcessor(df)
    processed_df = processor.process_all()
    processed_dfs[name] = processed_df

In [10]:
# Per client, show how many rows remain for each language code after processing.
for name, df in processed_dfs.items():
    print(f"{name}:", df["language"].value_counts(), sep="\n")

still:
language
en    3
Name: count, dtype: int64
balluf:
language
en    2753
Name: count, dtype: int64
orange:
language
en    65
Name: count, dtype: int64
pmi:
language
en    2536
Name: count, dtype: int64
bosch:
language
en    3901
Name: count, dtype: int64


### Selecting the dataframes with abundant English ("en") posts

In [11]:
# Clients kept for the English-language analysis, in the order they are reported below.
selected_keys = ["pmi", "bosch", "balluf"]
en_clients = {client: processed_dfs[client] for client in selected_keys}

In [12]:
# Report (rows, columns) of every selected client frame.
for name, df in en_clients.items():
    print(name, df.shape, sep=": ")

pmi: (2536, 8)
bosch: (3901, 8)
balluf: (2753, 8)


### Final check for empty or NaN cells

In [13]:
# Count, per selected client, the rows that contain at least one NaN or empty-string cell.
for name, df in en_clients.items():
    mask = df.eq("") | df.isna()
    rows_with_empty_or_nan = mask.any(axis="columns")
    count = rows_with_empty_or_nan.sum()
    print(f"{name}: {count} rows have NaN or empty string")

pmi: 0 rows have NaN or empty string
bosch: 0 rows have NaN or empty string
balluf: 0 rows have NaN or empty string


In [14]:
# Display the processed frame of the "pmi" client for a visual sanity check.
en_clients["pmi"]

Unnamed: 0,comment,published_at,details_type,likes,comments,estimated_covearage,user_network_at_stats_check_date,language
0,Imagine a workplace where nothing is urgent......,1706082416,NATIVE_GALLERY,56.0,12.0,2615.0,1758.0,en
1,Valentine's Day confession...🧡I absolutely lov...,1707893450,NATIVE_GALLERY,82.0,5.0,2643.0,982.0,en
2,"If you know “Co mi zrobisz jak mnie złapiesz”,...",1710949648,LINK,1.0,0.0,255.0,658.0,en
3,Hey there! I just wanted to share some excitin...,1704894218,LINK,31.0,2.0,1144.0,769.0,en
4,➡️ Navigating challenges is easier with a stro...,1704355230,NATIVE_FILE,7.0,0.0,655.0,1373.0,en
...,...,...,...,...,...,...,...,...
2531,Can one employee be worth more than several ot...,1737716332,LINK,1.0,0.0,245.0,629.0,en
2532,"What a night it was! Last Friday, on January...",1737716392,VIDEO,2.0,0.0,270.0,629.0,en
2533,"Whether it’s about boosting your career, gaini...",1737716431,LINK,1.0,0.0,245.0,629.0,en
2534,Helósztok! Hogy vagy? 🙂 Did you know how to a...,1737716814,NATIVE_FILE,13.0,0.0,915.0,1687.0,en


In [15]:
# Write the processed "pmi" frame to the working directory; index=True stores the
# row index as the first, unnamed CSV column.
en_clients["pmi"].to_csv(path_or_buf="pmi_raw.csv", index=True)