# Data Understanding – MoMo Top-up Case Study

In [1]:
import os
import re
from pathlib import Path

import pandas as pd

In [2]:
# Root of the project's Data folder. Set the MOMO_DATA_DIR environment variable
# to run the notebook on another machine; the fallback is the original location.
DATA_DIR = Path(os.environ.get(
    "MOMO_DATA_DIR",
    "C:/Users/ASUS/Documents/Project/Momo case study/Data",
))
# Kept as a forward-slash string so the default value is unchanged.
path = (DATA_DIR / "raw" / "momo_top_up.xlsx").as_posix()

In [3]:
# First worksheet (index 0) is loaded as the transactions table; preview the top rows.
df_transactions = pd.read_excel(io=path, sheet_name=0)
df_transactions.head()

Unnamed: 0,user_id,order_id,Date,Amount,Merchant_id,Purchase_status
0,21269588,4169517626,2020-01-01,10000,13,
1,28097592,4170276686,2020-01-01,20000,13,
2,47435144,4166729310,2020-01-01,10000,12,
3,29080935,4174460303,2020-01-01,10000,13,
4,14591075,4168216749,2020-01-01,10000,12,


In [4]:
# List the distinct raw Date values that pandas' default parser cannot handle
# (coerced to NaT). Missing values, if any, would show up here as well.
a = df_transactions['Date']
mask = pd.to_datetime(a, errors='coerce').isna()
a.loc[mask].unique()

array(['27/9/2020', '28/9/2020', '29/9/2020', '30/9/2020'], dtype=object)

In [5]:
def normalize_date_string(series: pd.Series) -> pd.Series:
    """Rewrite date-like values as zero-padded ``YYYY-MM-DD`` strings.

    Accepted inputs, per element:

    * ``Y-M-D`` or ``Y/M/D`` strings with a four-digit year;
    * ``D-M-Y`` or ``D/M/Y`` strings with a four-digit year. These are read
      day-first, e.g. ``'27/9/2020'`` -> ``'2020-09-27'``;
    * ``datetime.date`` / ``datetime.datetime`` / ``pd.Timestamp`` objects,
      formatted from their year, month and day fields (any time part is
      dropped). This makes the function safe to re-apply to a column that has
      already been parsed to datetimes.

    Missing values and anything matching none of the above become ``pd.NA``.
    Month/day ranges are not validated: ``'2020-13-45'`` is returned as-is.

    Args:
        series: Values to normalise.

    Returns:
        A Series with the same index holding ``YYYY-MM-DD`` strings or ``pd.NA``.
    """
    # Function-scope import so the helper does not depend on a module-level
    # name that a later notebook cell could shadow.
    from datetime import date

    year_first = re.compile(r"^(\d{4})[-/](\d{1,2})[-/](\d{1,2})$")
    day_first = re.compile(r"^(\d{1,2})[-/](\d{1,2})[-/](\d{4})$")

    def _normalize(x):
        if pd.isna(x):
            return pd.NA

        # datetime and pd.Timestamp are subclasses of date. Format them from
        # their fields: str(x) would carry a time part the patterns reject.
        if isinstance(x, date):
            return f"{x.year:04d}-{x.month:02d}-{x.day:02d}"

        text = str(x).strip()

        m = year_first.match(text)
        if m:
            y, mth, d = m.groups()
            return f"{y}-{int(mth):02d}-{int(d):02d}"

        m = day_first.match(text)
        if m:
            d, mth, y = m.groups()
            return f"{y}-{int(mth):02d}-{int(d):02d}"

        return pd.NA

    return series.apply(_normalize)

In [6]:
# Bring every Date value to the 'YYYY-MM-DD' text form, then count rows per
# day as a quick sanity check.
df_transactions['Date'] = df_transactions['Date'].pipe(normalize_date_string)
df_transactions['Date'].value_counts()

Date
2020-06-30    79
2020-07-30    75
2020-11-30    75
2020-05-30    72
2020-06-10    70
              ..
2020-03-11    23
2020-03-06    22
2020-01-11    22
2020-01-12    21
2020-01-07    13
Name: count, Length: 366, dtype: int64

In [7]:
# The previous cell leaves only 'YYYY-MM-DD' strings (or missing values) in
# Date, so parse with that exact format. An explicit format is deterministic
# and turns anything unexpected into NaT, which the null count below exposes.
df_transactions['Date'] = pd.to_datetime(
    df_transactions['Date'], format='%Y-%m-%d', errors='coerce'
)
print(df_transactions['Date'].isnull().sum())

0


In [8]:
# Check column dtypes and non-null counts after the Date conversion.
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13495 entries, 0 to 13494
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   user_id          13495 non-null  int64         
 1   order_id         13495 non-null  int64         
 2   Date             13495 non-null  datetime64[ns]
 3   Amount           13495 non-null  object        
 4   Merchant_id      13495 non-null  int64         
 5   Purchase_status  2235 non-null   object        
dtypes: datetime64[ns](1), int64(3), object(2)
memory usage: 632.7+ KB


In [9]:
# Amount is read as object dtype. Remove ',' characters (presumably thousands
# separators, verify against the sheet) and cast the result to float.
df_transactions['Amount'] = (
    df_transactions['Amount']
    .astype(str)
    .str.replace(',', '')
    .astype(float)
)

In [10]:
# Encode Purchase_status as 1 where the raw label is exactly 'Mua hộ', else 0
# (missing labels become 0 as well).
# Guard: once the column is integer-coded, re-running this cell would compare
# integers with the label and silently reset every flag to 0, so skip it then.
# NOTE(review): the match is exact. Confirm the sheet has no whitespace or
# Unicode-normalisation variants of the label.
if not pd.api.types.is_integer_dtype(df_transactions['Purchase_status']):
    df_transactions['Purchase_status'] = (
        df_transactions['Purchase_status'] == 'Mua hộ'
    ).astype(int)

## Quan sát `df_user_info` (inspecting the user-info sheet)


In [11]:
# Third worksheet (index 2) is loaded as the user-info table; inspect its schema.
df_user_info = pd.read_excel(io=path, sheet_name=2)

df_user_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13428 entries, 0 to 13427
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User_id          13428 non-null  int64 
 1   First_tran_date  13428 non-null  object
 2   Location         13428 non-null  object
 3   Age              13428 non-null  object
 4   Gender           13428 non-null  object
dtypes: int64(1), object(4)
memory usage: 524.7+ KB


In [12]:
# List the distinct First_tran_date values that pandas' default parser rejects
# (coerced to NaT).
s = df_user_info['First_tran_date']
mask = pd.to_datetime(s, errors='coerce').isna()
s.loc[mask].unique()

array(['9918-01-01', '9917-10-05', '9919-09-02', '9917-09-28',
       '9917-10-31', '9917-09-25', '9918-06-06', '9917-10-06',
       '9917-09-23', '9917-10-04', '9919-03-17', '9919-04-07',
       '9919-05-21', '9920-03-12', '9919-07-02', '9919-11-20',
       '9917-10-10', '9917-10-22', '9918-07-20', '9917-11-10',
       '9917-10-01', '9917-09-22', '9918-11-15', '9919-10-06',
       '9920-03-10', '9917-10-20', '9917-09-27', '9917-10-09',
       '9917-10-26', '9920-03-04', '9920-03-24', '9917-10-11',
       '3020-05-28', '9917-10-07', '9920-03-05', '9918-04-16',
       '9919-12-14', '3020-12-01', '9920-02-14', '9920-04-27',
       '9919-03-22', '9920-12-16', '9920-05-31', '9918-09-27',
       '9919-09-15', '9920-12-12', '9918-02-28', '9917-12-27',
       '9918-10-25', '9918-10-01', '9917-12-01', '9920-12-06',
       '9919-11-18', '9919-11-06', '9919-03-19', '9920-01-03',
       '9918-02-16', '9919-02-09', '9918-12-29', '9917-12-20',
       '9919-09-14', '9917-11-01'], dtype=object)

In [13]:
def fix_corrupted_year(
    series: pd.Series, min_year: int = 2000, max_year: int = 2030
) -> pd.Series:
    """Repair implausible four-digit years at the start of date strings.

    Each value is converted to text and stripped. If it starts with four
    digits followed by ``-`` or ``/`` and that year lies outside
    ``[min_year, max_year]``, the first two digits are replaced by ``"20"``
    (``'9918-01-01'`` -> ``'2018-01-01'``, ``'3020-05-28'`` -> ``'2020-05-28'``).
    Values that do not match the pattern are returned unchanged (as stripped
    text).

    Missing values stay missing (``pd.NA``) instead of becoming the literal
    strings ``'nan'`` / ``'None'``.

    Caveat: the repaired year is not re-checked, so ``'1999-...'`` becomes
    ``'2099-...'`` with the default bounds.

    Args:
        series: Date-like values, typically strings.
        min_year: Smallest year accepted without repair.
        max_year: Largest year accepted without repair.

    Returns:
        A Series with the same index holding the repaired strings.
    """
    leading_year = re.compile(r'^(\d{4})([-/].*)$')

    def _fix(x):
        # Some string dtypes pass missing values through astype(str) as
        # non-strings; leave those for the mask below.
        if not isinstance(x, str):
            return x

        m = leading_year.match(x)
        if not m:
            return x

        y, rest = m.groups()
        if not (min_year <= int(y) <= max_year):
            y = "20" + y[-2:]

        return y + rest

    # Record missingness before the string cast erases it.
    missing = series.isna()
    text = series.astype(str).str.strip()
    return text.apply(_fix).mask(missing, pd.NA)

In [14]:
# Repair the corrupted years first, then bring every value to 'YYYY-MM-DD'.
df_user_info['First_tran_date'] = (
    df_user_info['First_tran_date']
    .pipe(fix_corrupted_year)
    .pipe(normalize_date_string)
)

In [15]:
# The previous cell leaves only 'YYYY-MM-DD' strings (or missing values), so
# parse with that exact format. An explicit format is deterministic and turns
# anything unexpected into NaT, which the null count below exposes.
df_user_info['First_tran_date'] = pd.to_datetime(
    df_user_info['First_tran_date'], format='%Y-%m-%d', errors='coerce'
)

print(df_user_info['First_tran_date'].isnull().sum())


0


In [16]:
# Use the same key spelling as df_transactions ('user_id').
df_user_info = df_user_info.rename(columns={'User_id': 'user_id'})

In [17]:
# Frequency of each raw Location label (missing values included), sorted alphabetically.
print(
    df_user_info['Location']
    .value_counts(dropna=False)
    .sort_index()
)


Location
HCMC                4112
HN                  1437
Ho Chi Minh City      63
Other               1033
Other Cities        6022
Unknown              761
Name: count, dtype: int64


In [18]:
# Fold label variants into three buckets: HCMC, HN and Other Cities.
# NOTE(review): 'Unknown' is merged into 'Other Cities', which hides missing
# location information. Confirm this is intended.
mapping_location = {
    'Ho Chi Minh City': 'HCMC',
    'Other': 'Other Cities',
    'Unknown': 'Other Cities',
}

df_user_info['Location'] = df_user_info['Location'].replace(to_replace=mapping_location)

print(
    df_user_info['Location']
    .value_counts()
)

Location
Other Cities    7816
HCMC            4175
HN              1437
Name: count, dtype: int64


In [19]:
# Frequency of each raw Gender label (missing values included), sorted alphabetically.
print(
    df_user_info['Gender']
    .value_counts(dropna=False)
    .sort_index()
)

Gender
FEMALE    3407
M           92
MALE      6282
Nam       1370
Nữ        1248
f           55
female     903
male        71
Name: count, dtype: int64


In [20]:
# Aliases that remain after upper-casing (Vietnamese labels and one-letter codes).
mapping_gender = {
    'NỮ': 'FEMALE',
    'F': 'FEMALE',
    'NAM': 'MALE',
    'M': 'MALE',
}

# Upper-case first so case variants collapse, then fold the aliases into MALE / FEMALE.
df_user_info['Gender'] = (
    df_user_info['Gender']
    .str.upper()
    .replace(mapping_gender)
)
print(
    df_user_info['Gender']
    .value_counts()
)

Gender
MALE      7815
FEMALE    5613
Name: count, dtype: int64


In [21]:
# Load the second worksheet (index 1) as the commission table; no cleaning is applied to it here.
df_commission = pd.read_excel(path, sheet_name=1)

In [22]:
# Final dtype / non-null check on the cleaned transactions table before export.
df_transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13495 entries, 0 to 13494
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   user_id          13495 non-null  int64         
 1   order_id         13495 non-null  int64         
 2   Date             13495 non-null  datetime64[ns]
 3   Amount           13495 non-null  float64       
 4   Merchant_id      13495 non-null  int64         
 5   Purchase_status  13495 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(4)
memory usage: 632.7 KB


In [23]:
# Write the three tables as CSV under <data root>/processed.
# The data root can be overridden with the MOMO_DATA_DIR environment variable;
# the fallback is the original location, so the default output is unchanged.
out_dir = Path(os.environ.get(
    "MOMO_DATA_DIR",
    "C:/Users/ASUS/Documents/Project/Momo case study/Data",
)) / "processed"
out_dir.mkdir(parents=True, exist_ok=True)

# utf-8-sig adds a BOM to each file.
# NOTE(review): df_commission is exported as "df_products.csv". Confirm the
# file name is what downstream consumers expect.
df_transactions.to_csv(out_dir / "df_transactions.csv", index=False, encoding="utf-8-sig")
df_user_info.to_csv(out_dir / "df_users.csv", index=False, encoding="utf-8-sig")
df_commission.to_csv(out_dir / "df_products.csv", index=False, encoding="utf-8-sig")