### Data Cleaning and Transformation

1. Import necessary packages

In [1]:
import pandas as pd
import os,sys

In [2]:
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))
from data_cleaning import DataFrameCleaner

2. Lead json data, convert to dataframe and merge

In [3]:
json_folder = "../data/raw"
json_files = [f for f in os.listdir(json_folder) if f.endswith(".json")]

In [4]:
# Initialize an empty list to store DataFrames
dataframes = []

# Loop through each JSON file, read it, and append it to the list
for file in json_files:
    file_path = os.path.join(json_folder, file)
    # Read the JSON file into a DataFrame
    df = pd.read_json(file_path, orient='records', encoding='utf-8')

    # Append the DataFrame to the list
    dataframes.append(df)

# Merge all DataFrames into a single DataFrame
data = pd.concat(dataframes, ignore_index=True)

In [5]:
data.head()

Unnamed: 0,id,name,sender,timestamp,text,media
0,97,CheMed,-1001627056354,2023-02-10 12:23:06+00:00,"⚠️Notice!\nDear esteemed customers,\nDue to fo...",./downloads\photo_2023-02-10_12-23-06.jpg
1,96,CheMed,-1001627056354,2023-02-02 08:58:52+00:00,Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...,./downloads\photo_2023-02-02_08-58-52.jpg
2,95,CheMed,-1001627056354,2023-02-01 08:59:37+00:00,አዚትሮማይሲን በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን...,./downloads\photo_2023-02-01_08-59-37.jpg
3,94,CheMed,-1001627056354,2023-01-31 09:19:53+00:00,Che-Med Trivia #3\n\nምግብና መጠጦች አንዳንድ መድሃኒቶች በደ...,./downloads\photo_2023-01-31_09-19-53.jpg
4,93,CheMed,-1001627056354,2023-01-30 09:45:25+00:00,"Che-Med Trivia #2\n\nእንደ Ciprofloxacin, Doxycy...",./downloads\photo_2023-01-30_09-45-25.jpg


2. Initialize DataFrameCleaner class with data

In [6]:
cleaner = DataFrameCleaner(data)

3. Replace null value with no [column name]

In [7]:
data = cleaner.clean_null_values()

In [8]:
data.tail(15)

Unnamed: 0,id,name,sender,timestamp,text,media
861,1055,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-18 07:43:58+00:00,Weekend with Yetena Weg! 🌟\n\nJoin us for two ...,./downloads\photo_2024-05-18_07-43-58.jpg
862,1054,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-17 13:58:27+00:00,no text,./downloads\photo_2024-05-17_13-58-27.jpg
863,1053,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-17 13:58:26+00:00,no text,./downloads\photo_2024-05-17_13-58-26.jpg
864,1052,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-17 13:58:26+00:00,no text,./downloads\photo_2024-05-17_13-58-26 (1).jpg
865,1051,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-17 13:58:25+00:00,no text,./downloads\photo_2024-05-17_13-58-25.jpg
866,1050,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-17 13:58:25+00:00,no text,./downloads\photo_2024-05-17_13-58-25 (1).jpg
867,1049,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-17 13:58:24+00:00,የደም ግፊት መጨመር እና የጤና መዘዞቹ!\n\n✍✍ የደም ግፊት መጨመር በ...,./downloads\photo_2024-05-17_13-58-24.jpg
868,1042,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-16 04:40:50+00:00,no text,./downloads\photo_2024-05-16_04-40-50.jpg
869,1041,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-16 04:40:50+00:00,no text,./downloads\photo_2024-05-16_04-40-50 (1).jpg
870,1040,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-16 04:40:49+00:00,no text,./downloads\photo_2024-05-16_04-40-49.jpg


4. Remove new line, Extra space and create an emoji column from text

In [9]:
data = cleaner.clean_text()

5. Create youtube, website and phone column and fills the value by extracting from text column

In [10]:
data = cleaner.extract_links()

In [11]:
data

Unnamed: 0,id,name,sender,timestamp,text,media,emoji,youtube,website,phone
0,97,CheMed,-1001627056354,2023-02-10 12:23:06+00:00,"️Notice! Dear esteemed customers, Due to four-...",./downloads\photo_2023-02-10_12-23-06.jpg,⚠🔅🔅,no youtube,no website,no phone
1,96,CheMed,-1001627056354,2023-02-02 08:58:52+00:00,Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...,./downloads\photo_2023-02-02_08-58-52.jpg,📌,no youtube,[www.chemeds.org],no phone
2,95,CheMed,-1001627056354,2023-02-01 08:59:37+00:00,አዚትሮማይሲን በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን...,./downloads\photo_2023-02-01_08-59-37.jpg,📌,no youtube,[www.chemeds.org],no phone
3,94,CheMed,-1001627056354,2023-01-31 09:19:53+00:00,Che-Med Trivia #3 ምግብና መጠጦች አንዳንድ መድሃኒቶች በደንብ...,./downloads\photo_2023-01-31_09-19-53.jpg,,no youtube,no website,no phone
4,93,CheMed,-1001627056354,2023-01-30 09:45:25+00:00,"Che-Med Trivia #2 እንደ Ciprofloxacin, Doxycycl...",./downloads\photo_2023-01-30_09-45-25.jpg,,no youtube,no website,no phone
...,...,...,...,...,...,...,...,...,...,...
871,1039,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-16 04:40:48+00:00,የጤና ወግ የዓለም አቀፍ የተደራሽነት ቀንን ስለ ቴክኖሎጂና ዲጂታል ተደራ...,./downloads\photo_2024-05-16_04-40-48.jpg,,no youtube,no website,no phone
872,1038,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-14 11:05:55+00:00,no text,./downloads\photo_2024-05-14_11-05-55.jpg,,no youtube,no website,no phone
873,1037,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-14 11:05:55+00:00,በዚህ እሁድ በቴሌግራም ቀጥታ ስርጭት ትኩሳት ያለው ልጅ በሚል ርዕስ በም...,./downloads\photo_2024-05-14_11-05-55 (1).jpg,🔥♨🗣👉🗓⏰💥👇👇👇👇👇👇👇👇👉,no youtube,"[https://t.me/yetenaweg?livestream, https://t....",no phone
874,1036,የጤና ወግ - የጤና መረጃ,-1001447066276,2024-05-13 21:57:37+00:00,no text,no media,,no youtube,no website,no phone


In [12]:
data.info

<bound method DataFrame.info of        id              name         sender                 timestamp  \
0      97            CheMed -1001627056354 2023-02-10 12:23:06+00:00   
1      96            CheMed -1001627056354 2023-02-02 08:58:52+00:00   
2      95            CheMed -1001627056354 2023-02-01 08:59:37+00:00   
3      94            CheMed -1001627056354 2023-01-31 09:19:53+00:00   
4      93            CheMed -1001627056354 2023-01-30 09:45:25+00:00   
..    ...               ...            ...                       ...   
871  1039  የጤና ወግ - የጤና መረጃ -1001447066276 2024-05-16 04:40:48+00:00   
872  1038  የጤና ወግ - የጤና መረጃ -1001447066276 2024-05-14 11:05:55+00:00   
873  1037  የጤና ወግ - የጤና መረጃ -1001447066276 2024-05-14 11:05:55+00:00   
874  1036  የጤና ወግ - የጤና መረጃ -1001447066276 2024-05-13 21:57:37+00:00   
875  1035  የጤና ወግ - የጤና መረጃ -1001447066276 2024-05-13 21:56:24+00:00   

                                                  text  \
0    ️Notice! Dear esteemed customers, Due to

6. Remove duplicates

In [13]:
data = cleaner.remove_duplicates()

  self.df = self.df.applymap(lambda x: tuple(x) if isinstance(x, list) else x)


In [14]:
data.head()

Unnamed: 0,id,name,sender,timestamp,text,media,emoji,youtube,website,phone
0,97,CheMed,-1001627056354,2023-02-10 12:23:06+00:00,"️Notice! Dear esteemed customers, Due to four-...",./downloads\photo_2023-02-10_12-23-06.jpg,⚠🔅🔅,no youtube,no website,no phone
1,96,CheMed,-1001627056354,2023-02-02 08:58:52+00:00,Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...,./downloads\photo_2023-02-02_08-58-52.jpg,📌,no youtube,"(www.chemeds.org,)",no phone
2,95,CheMed,-1001627056354,2023-02-01 08:59:37+00:00,አዚትሮማይሲን በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን...,./downloads\photo_2023-02-01_08-59-37.jpg,📌,no youtube,"(www.chemeds.org,)",no phone
3,94,CheMed,-1001627056354,2023-01-31 09:19:53+00:00,Che-Med Trivia #3 ምግብና መጠጦች አንዳንድ መድሃኒቶች በደንብ...,./downloads\photo_2023-01-31_09-19-53.jpg,,no youtube,no website,no phone
4,93,CheMed,-1001627056354,2023-01-30 09:45:25+00:00,"Che-Med Trivia #2 እንደ Ciprofloxacin, Doxycycl...",./downloads\photo_2023-01-30_09-45-25.jpg,,no youtube,no website,no phone


In [15]:
data = cleaner.clean_null_values()
# convert message date to proper format
data = cleaner.convert_timestamp('timestamp')

In [16]:
data.head(3)

Unnamed: 0,id,name,sender,timestamp,text,media,emoji,youtube,website,phone
0,97,CheMed,-1001627056354,2023-02-10 12:23:06+00:00,"️Notice! Dear esteemed customers, Due to four-...",./downloads\photo_2023-02-10_12-23-06.jpg,⚠🔅🔅,no youtube,no website,no phone
1,96,CheMed,-1001627056354,2023-02-02 08:58:52+00:00,Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...,./downloads\photo_2023-02-02_08-58-52.jpg,📌,no youtube,"(www.chemeds.org,)",no phone
2,95,CheMed,-1001627056354,2023-02-01 08:59:37+00:00,አዚትሮማይሲን በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን...,./downloads\photo_2023-02-01_08-59-37.jpg,📌,no youtube,"(www.chemeds.org,)",no phone


In [17]:
data = data.rename(columns={
    "id": "message_id",
    "name": "channel_title",
    "sender": "sender",
    "timestamp": "message_date",
    "text": "message",
    "media": "media_path",
    "emoji": "emoji",
    "youtube": "youtube",
    "website": "website",
    "phone": "phone"
})


In [18]:
data.head()

Unnamed: 0,message_id,channel_title,sender,message_date,message,media_path,emoji,youtube,website,phone
0,97,CheMed,-1001627056354,2023-02-10 12:23:06+00:00,"️Notice! Dear esteemed customers, Due to four-...",./downloads\photo_2023-02-10_12-23-06.jpg,⚠🔅🔅,no youtube,no website,no phone
1,96,CheMed,-1001627056354,2023-02-02 08:58:52+00:00,Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...,./downloads\photo_2023-02-02_08-58-52.jpg,📌,no youtube,"(www.chemeds.org,)",no phone
2,95,CheMed,-1001627056354,2023-02-01 08:59:37+00:00,አዚትሮማይሲን በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን...,./downloads\photo_2023-02-01_08-59-37.jpg,📌,no youtube,"(www.chemeds.org,)",no phone
3,94,CheMed,-1001627056354,2023-01-31 09:19:53+00:00,Che-Med Trivia #3 ምግብና መጠጦች አንዳንድ መድሃኒቶች በደንብ...,./downloads\photo_2023-01-31_09-19-53.jpg,no emoji,no youtube,no website,no phone
4,93,CheMed,-1001627056354,2023-01-30 09:45:25+00:00,"Che-Med Trivia #2 እንደ Ciprofloxacin, Doxycycl...",./downloads\photo_2023-01-30_09-45-25.jpg,no emoji,no youtube,no website,no phone


In [19]:
data.to_csv("../data/cleaned_data.csv", index = False)