In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project files under drive/MyDrive/ can be read and written below.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# ls drive/MyDrive/'Final_Project_ICH'/'CSV_file'/

In [3]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt

# Очистка данных в таблице **Spend**
- преобразование дат в формат datetime
- проверка дубликатов и их удаление
- удаление пустых (нерелевантных) столбцов
- просмотр уникальных значений
- проверка значений NaN
- замена типа данных столбцов Campaign, AdGroup, Ad на категориальный
- контроль данных и значений

In [4]:
# Load the raw Spend table from the mounted Drive folder.
spend_df = pd.read_csv('drive/MyDrive/Final_Project_ICH/CSV_file/Spend.csv')
# Print column dtypes and non-null counts, then preview the first rows.
spend_df.info()
spend_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20779 entries, 0 to 20778
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Date         20779 non-null  object
 1   Source       20779 non-null  object
 2   Campaign     14785 non-null  object
 3   Impressions  20779 non-null  int64 
 4   Spend        20779 non-null  object
 5   Clicks       20779 non-null  int64 
 6   AdGroup      13951 non-null  object
 7   Ad           13951 non-null  object
dtypes: int64(2), object(6)
memory usage: 1.3+ MB


Unnamed: 0,Date,Source,Campaign,Impressions,Spend,Clicks,AdGroup,Ad
0,2023-07-03,Google Ads,gen_analyst_DE,6,0.00€,0,,
1,2023-07-03,Google Ads,performancemax_eng_DE,4,0.01€,1,,
2,2023-07-03,Facebook Ads,,0,0.00€,0,,
3,2023-07-03,Google Ads,,0,0.00€,0,,
4,2023-07-03,CRM,,0,0.00€,0,,


In [5]:
# Convert the Date column from text to datetime (expected layout YYYY-MM-DD).
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising, so the null check further down also covers bad dates.
spend_df["Date"] = pd.to_datetime(spend_df["Date"], format="%Y-%m-%d", errors='coerce')

In [6]:
# Inspect the rows whose Source is 'Test'.
spend_df.loc[spend_df['Source'].eq('Test')]

Unnamed: 0,Date,Source,Campaign,Impressions,Spend,Clicks,AdGroup,Ad
7410,2023-12-01,Test,,0,0.00€,0,,
7469,2023-12-02,Test,,0,0.00€,0,,
7507,2023-12-03,Test,,0,0.00€,0,,
7538,2023-12-04,Test,,0,0.00€,0,,
7573,2023-12-05,Test,,0,0.00€,0,,
...,...,...,...,...,...,...,...,...
18948,2024-05-27,Test,,0,0.00€,0,,
19039,2024-05-28,Test,,0,0.00€,0,,
19120,2024-05-29,Test,,0,0.00€,0,,
19168,2024-05-30,Test,,0,0.00€,0,,


In [7]:
# Remove every row whose Source is 'Test'.
# .copy() makes the result an independent DataFrame rather than a view-like
# slice of the original, so assigning to its columns or modifying it later
# does not raise SettingWithCopyWarning or act on ambiguous data.
spend_df = spend_df[spend_df['Source'] != 'Test'].copy()

In [8]:
# Sanity check: this selection should be empty once the 'Test' rows are gone.
spend_df.loc[spend_df['Source'].eq('Test')]

Unnamed: 0,Date,Source,Campaign,Impressions,Spend,Clicks,AdGroup,Ad


In [9]:
# Count the distinct (non-null) values in each column.
spend_df.nunique()

Unnamed: 0,0
Date,355
Source,13
Campaign,50
Impressions,3996
Spend,2854
Clicks,552
AdGroup,20
Ad,165


In [10]:
# List the distinct AdGroup labels (NaN included) to spot odd or duplicated names.
spend_df["AdGroup"].unique()

array([nan, 'women', 'wide', 'interest_programming', 'recentlymoved',
       'interest_dataanalytics', 'interest_work',
       'interest_programming – Copy', 'interest_dataanalytics – Copy',
       'LAL1', 'b', 'Com_july_1', 'interest_all', 'Com_august',
       'interest_work_WebDev', 'interest_programming_WebDev',
       'promoposts_b', 'retargeting', 'interest_python-developer',
       'berlin_wide', 'Com_march'], dtype=object)

In [11]:
def convert_currency_to_float(value):
    """Convert a currency string such as '0.01€' or '$1,234.50' to a float.

    Parameters
    ----------
    value : Any
        A string that may contain a '$' or '€' sign and ',' thousands
        separators, or an already-numeric value.

    Returns
    -------
    float
        The parsed amount. Numeric input (int / float / NumPy numbers) is
        passed through as ``float`` so that re-running the conversion on an
        already converted column is a no-op instead of wiping it to NaN.
        ``np.nan`` is returned for missing values, booleans, unsupported
        types, and strings that cannot be parsed.

    Notes
    -----
    A comma is always treated as a thousands separator and removed; the dot
    is the decimal mark. A European-style decimal comma ('1,5') is therefore
    NOT supported and would be read as 15.0.
    """
    if isinstance(value, str):
        # Strip currency signs and thousands separators, then surrounding whitespace.
        cleaned = value.replace('$', '').replace('€', '').replace(',', '').strip()
        try:
            return float(cleaned)
        except ValueError:
            # Empty or non-numeric text.
            return np.nan

    # bool is a subclass of int, but True/False is not a monetary amount.
    if isinstance(value, (bool, np.bool_)):
        return np.nan

    # Already numeric: keep the value (a NaN float stays NaN).
    if isinstance(value, (int, float, np.integer, np.floating)):
        return float(value)

    # None, pd.NA, NaT, containers and any other type.
    return np.nan

# Parse the Spend column element by element into floats.
spend_df['Spend'] = spend_df['Spend'].map(convert_currency_to_float)

# Preview the converted column.
spend_df.loc[:, ['Spend']].head()

Unnamed: 0,Spend
0,0.0
1,0.01
2,0.0
3,0.0
4,0.0


In [12]:
# Preview the first rows of the frame after the Spend conversion.
spend_df.head()

Unnamed: 0,Date,Source,Campaign,Impressions,Spend,Clicks,AdGroup,Ad
0,2023-07-03,Google Ads,gen_analyst_DE,6,0.0,0,,
1,2023-07-03,Google Ads,performancemax_eng_DE,4,0.01,1,,
2,2023-07-03,Facebook Ads,,0,0.0,0,,
3,2023-07-03,Google Ads,,0,0.0,0,,
4,2023-07-03,CRM,,0,0.0,0,,


In [13]:
# Show rows that exactly repeat an earlier row (the first occurrence is not flagged).
spend_df.loc[spend_df.duplicated(keep='first')]

Unnamed: 0,Date,Source,Campaign,Impressions,Spend,Clicks,AdGroup,Ad
755,2023-07-23,Bloggers,,0,0.0,0,,
789,2023-07-24,Bloggers,,0,0.0,0,,
844,2023-07-25,Bloggers,,0,0.0,0,,
899,2023-07-26,Bloggers,,0,0.0,0,,
958,2023-07-27,Bloggers,,0,0.0,0,,
...,...,...,...,...,...,...,...,...
20745,2024-06-21,Bloggers,,0,0.0,0,,
20746,2024-06-21,Facebook Ads,,0,0.0,0,,
20750,2024-06-21,SMM,,0,0.0,0,,
20764,2024-06-21,Telegram posts,,0,0.0,0,,


In [14]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# With no `subset`, all columns are compared. `subset` expects column labels,
# not the DataFrame itself. Reassigning instead of using inplace=True keeps the
# cell safe to re-run.
spend_df = spend_df.drop_duplicates()

In [15]:
# Number of missing values in each column.
spend_df.isna().sum()

Unnamed: 0,0
Date,0
Source,0
Campaign,4984
Impressions,0
Spend,0
Clicks,0
AdGroup,5818
Ad,5818


In [16]:
# Row count per Campaign value (NaN is excluded by value_counts by default).
spend_df['Campaign'].value_counts()

Unnamed: 0_level_0,count
Campaign,Unnamed: 1_level_1
12.07.2023wide_DE,2073
02.07.23wide_DE,1685
04.07.23recentlymoved_DE,1398
youtube_shorts_DE,1223
07.07.23LAL_DE,1181
03.07.23women,1171
12.09.23interests_Uxui_DE,1143
15.07.23b_DE,529
24.09.23retargeting_DE,504
performancemax_eng_DE,355


In [17]:
# List the distinct Ad identifiers (NaN included).
spend_df["Ad"].unique()

array([nan, 'b3', 'b1', 'b4', 'b2', 'v2', 'v1', 'b4com', 'b3com', 'b2com',
       'b1com', 'v6com', 'v5', 'v4com', 'v3com', 'v5com', 'ad4', 'ad1',
       'ad2', 'ad3', 'v8com', 'v7com', 'ad6', 'ad5', 'bloggersvideo1com',
       'v9com', 'ad9', 'ad8', 'ad_blogger_1', 'ad_blogger_2', 'ad7',
       'web_b3', 'web_b5', 'web_b1', 'web_b4', 'web_b2', 'ad_blogger_3',
       'v10com', 'bloggersvideo2com', 'b5', 'b6', 'b8', 'b7', 'v3', 'v10',
       'v12', 'v11com', 'v11', 'ad_gov_1', 'ad_da_1', 'b3comwebdev',
       'bloggersvideo2comwebdev', 'v11comwebdev', 'b1comwebdev',
       'b2comwebdev', 'bloggersvideo4com', 'bloggersvideo3com',
       'bloggersvideo5', 'promo2', 'promo1', 'ad_blogger_4',
       'bloggersvideo4', 'b10', 'b11', 'b12', 'ad_blogger_6', 'promo3',
       'b15blackfriday', 'b14blackfriday', 'b13blackfriday', 'b7webinar',
       'b6webinar', 'b4webinar', 'b5webinar', 'bloggersvideo6blackfriday',
       'bloggersvideo6webinar', 'bloggersvideo7blackfriday',
       'bloggersvideo

In [18]:
# Re-check the distinct AdGroup labels after cleaning, before saving.
spend_df["AdGroup"].unique()

array([nan, 'women', 'wide', 'interest_programming', 'recentlymoved',
       'interest_dataanalytics', 'interest_work',
       'interest_programming – Copy', 'interest_dataanalytics – Copy',
       'LAL1', 'b', 'Com_july_1', 'interest_all', 'Com_august',
       'interest_work_WebDev', 'interest_programming_WebDev',
       'promoposts_b', 'retargeting', 'interest_python-developer',
       'berlin_wide', 'Com_march'], dtype=object)

In [19]:
# Save the cleaned table in both formats.
# index=False on both writers keeps the outputs consistent: earlier row
# filtering leaves gaps in the index, and without it the Parquet file would
# store that leftover index as an extra column while the CSV would not.
spend_df.to_parquet('drive/MyDrive/Final_Project_ICH/Clean_file/SpendClean.parquet', index=False)
spend_df.to_csv('drive/MyDrive/Final_Project_ICH/Clean_file/SpendClean.csv', index=False)