# **MONTHLY SPENDING DATA CLEANING**

In [1]:
import re
import pandas as pd

## **1. DATA LOADING**

In [2]:
# Load the raw spending data from a CSV file in the working directory.
df = pd.read_csv('monthly_spending_dataset.csv')

In [3]:
# Preview the first five rows to see the raw formats used in the data.
df.head(n=5)

Unnamed: 0,id,monthly_spending
0,3D2E7EA4,48000 к
1,85C27C17,29000 - 39000 тр
2,362E1CB2,151 тыс руб
3,64B537AB,130000 - 166000 т р
4,524D168B,13 тр


In [4]:
# Row count of the loaded frame.
print('Number of observations:', df.shape[0])

Number of observations: 10928


**— Checking for missing values.**

In [5]:
# Count missing entries per column.
df.isna().sum()

id                  0
monthly_spending    0
dtype: int64

## **2. DATA CLEANING**

**— Viewing possible options for the text component of ```'monthly_spending'```.**

In [6]:
def _strip_digits(raw_value):
    """Return the lower-cased value with every digit character removed."""
    lowered = raw_value.lower()
    return ''.join(char for char in lowered if not char.isdigit())


# Keep only the textual part (units, separators) of each raw spending value.
df['no_digit_part'] = df['monthly_spending'].apply(_strip_digits)

In [7]:
# Distinct textual patterns found in the raw values, in order of first appearance.
text_variants = df['no_digit_part'].unique()
print(text_variants)

[' к' ' -  тр' ' тыс руб' ' -  т р' ' тр' ' k' ' тыс' ' т' ' -  тыс р'
 ' -  тыс' ' т р' ' -  к' ' тыс р' ' -  k' '' ' -  т' ' -  тыс руб'
 ' +  тыс' ' тысяч' ' тысяч р' ' -  тысяч' ' к +  к' ' -  т руб' ' +  тр'
 ' тысяч руб' '. - . м' ' - , мил' ' +  тысяч рублей' ' -  тысяч р'
 ' тысяч рублей' ' тыс рублей' ' тыс -  мил' ' тыс - , м' ' тыс р -  мил'
 '. - . мил' ', - , м' ' -  тысяч рублей' '.- . млн' ' - . м' '. - . млн'
 ', - , млн' ' тыс -  млн' ' тр -  млн' ' + ']


**— Cleaning monthly spending values: summing the two amounts if there is a plus, and taking their average if there is a dash.**

In [8]:
_THOUSAND = 1000
_MILLION = 1000000

# Unsigned on purpose: a '+' or '-' between two amounts is a separator
# ("1.5-2.5 млн"), not a sign, so it must never be absorbed into the number.
_AMOUNT_RE = re.compile(r'\d*\.\d+|\d+')


def _needs_scaling(amounts, factors):
    """Tell whether the amounts are abbreviated and must be multiplied by their unit.

    Returns True when every amount is below its unit factor, False when none
    is, and None when the amounts disagree (ambiguous input).
    """
    below = [amount < factor for amount, factor in zip(amounts, factors)]
    if all(below):
        return True
    if not any(below):
        return False
    return None


def clean_monthly_spending(cell):
    """Convert a free-text monthly spending value into a number.

    The value is lower-cased, decimal commas become dots and the currency
    words 'рублей' / 'руб' / 'р' are dropped. The unit spellings
    'к', 'тысяч', 'тыс', 'т' (and a literal 'k') are treated as thousands;
    'млн', 'мил', 'м' are treated as millions.

    Rules:
      * one amount           -> that amount;
      * two amounts and '+'  -> their sum;
      * two amounts and '-'  -> their average.

    A unit multiplies an amount only when the amount is smaller than the unit
    itself, so '48 к' becomes 48000 while '48000 к' stays 48000. When both a
    thousand and a million unit are present with two amounts, the first amount
    is taken as thousands and the second as millions.

    Parameters
    ----------
    cell : any
        Raw value; it is converted with ``str()`` before parsing.

    Returns
    -------
    float
        The parsed amount, or 0.0 when the value cannot be interpreted:
        no amount or more than two amounts, two amounts without '+' or '-',
        both '+' and '-' present, one amount tagged with both units, or two
        amounts whose magnitudes disagree about whether the unit applies.
    """
    text = str(cell).lower().replace(',', '.')
    for currency_word in ('рублей', 'руб', 'р'):
        text = text.replace(currency_word, '')

    has_plus = '+' in text
    has_dash = '-' in text
    if has_plus and has_dash:
        # Mixing a sum and a range in one value is not interpretable.
        return 0.0

    # Longer spellings are replaced first so their shorter prefixes do not split them.
    for thousand_alias in ('к', 'тысяч', 'тыс', 'т'):
        text = text.replace(thousand_alias, 'k')
    for million_alias in ('млн', 'мил', 'м'):
        text = text.replace(million_alias, 'mm')

    has_k = 'k' in text
    has_mm = 'mm' in text
    amounts = [float(token) for token in _AMOUNT_RE.findall(text)]

    if len(amounts) == 1:
        if has_k and has_mm:
            # One amount cannot be in thousands and millions at once.
            return 0.0
        amount = amounts[0]
        factor = _THOUSAND if has_k else _MILLION if has_mm else 1
        return amount * factor if amount < factor else amount

    if len(amounts) != 2 or not (has_plus or has_dash):
        return 0.0

    first, second = amounts
    if not (has_k or has_mm):
        total = first + second
    else:
        if has_k and has_mm:
            factors = (_THOUSAND, _MILLION)
        elif has_k:
            factors = (_THOUSAND, _THOUSAND)
        else:
            factors = (_MILLION, _MILLION)

        scale = _needs_scaling(amounts, factors)
        if scale is None:
            return 0.0
        if not scale:
            total = first + second
        elif factors[0] == factors[1]:
            total = (first + second) * factors[0]
        else:
            total = first * factors[0] + second * factors[1]

    return total if has_plus else total / 2

In [9]:
# Parse every raw spending value into a numeric column.
raw_spending = df['monthly_spending']
df['cleaned_monthly_spending'] = raw_spending.map(clean_monthly_spending)

## **3. FUNCTION APPLICATION TESTING**

In [10]:
# Spot-check raw vs. cleaned values on a random sample; the fixed seed keeps
# the displayed rows identical across Restart & Run All.
df[['monthly_spending', 'cleaned_monthly_spending']].sample(10, random_state=42)

Unnamed: 0,monthly_spending,cleaned_monthly_spending
9978,143 тыс руб,143000.0
9218,79 k,79000.0
10576,258000 - 358000 тыс,308000.0
9716,150000,150000.0
4267,28 k,28000.0
297,43000 к,43000.0
9252,331 тыс,331000.0
8053,103000 т,103000.0
9905,182000 - 186000 тыс руб,184000.0
1739,15 тыс,15000.0


In [11]:
# Summary statistics of the cleaned column, rounded to one decimal place.
cleaned_spending = df['cleaned_monthly_spending']
spending_summary = {
    'Mean Monthly Spending:': cleaned_spending.mean(),
    'Median Monthly Spending:': cleaned_spending.median(),
    'Min Monthly Spending:': cleaned_spending.min(),
    'Max Monthly Spending:': cleaned_spending.max(),
}
for summary_label, summary_value in spending_summary.items():
    print(summary_label, round(summary_value, 1))

Mean Monthly Spending: 125653.9
Median Monthly Spending: 85500.0
Min Monthly Spending: 9000.0
Max Monthly Spending: 1485000.0
