In [98]:
import pandas as pd
import numpy as np
import re
from itertools import chain, combinations

In [99]:
def CV(x):
    """
    Coefficient of variation of a Series.

    x: Series object
    Returns x.std() divided by x.mean().
    """
    spread = x.std()
    centre = x.mean()
    return spread / centre

In [100]:
def SplitValue(x):
    """
    Return the first whitespace-separated token of a string.

    x: string object (or a missing value)
    Returns pd.NA when x is missing or contains no tokens at all.
    """
    if pd.isna(x):
        return pd.NA

    tokens = x.split()
    # An empty or whitespace-only string yields no tokens
    return tokens[0] if tokens else pd.NA

In [101]:
def SplitUnit(x):
    """
    Return everything after the first whitespace-separated token of a string.

    x: string object (or a missing value)
    The remaining tokens are concatenated without any separator.
    Returns pd.NA when x is missing or has at most one token.
    """
    if pd.isna(x):
        return pd.NA

    tokens = x.split()
    if len(tokens) > 1:
        # Drop the leading token and glue the rest together
        return "".join(tokens[1:])
    return pd.NA

In [102]:
def Density(x):
    """
    Look up the conversion factor associated with a fuel type.

    x: string object (fuel type), or a missing value
    Returns NaN for a missing value, 0.83 for "Diesel", 0.55 for "LPG",
    0.74 for "Petrol" and 1 for any other value.
    NOTE(review): the factors are presumably fuel densities used to convert
    mileage units - confirm against the caller.
    """
    if pd.isna(x):
        # Use np.nan: the np.NaN alias no longer exists in NumPy 2.0
        return np.nan

    factors = {"Diesel": 0.83, "LPG": 0.55, "Petrol": 0.74}
    # Since all CNG has mileage in km/kg already (no need to convert)
    return factors.get(x, 1)

In [103]:
def TorqueVal(x):
    """
    Extract the first numeric token from a string.

    x: string object (or a missing value)
    Returns the first run of digits, commas and periods that contains at
    least one digit, as a string (e.g. "1,200"). Returns NaN when x is
    missing or holds no digit.
    """
    if pd.isna(x):
        # Use np.nan: the np.NaN alias no longer exists in NumPy 2.0
        return np.nan

    # Require a digit inside the run so that stray punctuation on its own
    # (e.g. the "." in "Max. 190Nm") is never returned as the value
    found = re.search(r'[0-9,.]*[0-9][0-9,.]*', x)
    if found is None:
        return np.nan
    return found.group(0)

In [104]:
# Sanity check: only the first numeric run ("123") is returned; the later "111" is ignored.
TorqueVal("123sth 111 @")

'123'

In [105]:
def TorqueUnit(x):
    """
    Identify the torque unit mentioned in a string.

    x: string object (or a missing value)
    The search is case-insensitive and the result is lower case: "kgm" or "nm".
    Returns pd.NA when x is missing, when neither unit appears, or when both
    units appear in the same string.
    """
    if pd.isna(x):
        return pd.NA

    units_seen = set(re.findall(r'(kgm|nm)', x.lower()))
    # Exactly one kind of unit must be present to give an unambiguous answer
    if len(units_seen) != 1:
        return pd.NA
    return units_seen.pop()

In [106]:
# Sanity check: the unit is matched case-insensitively and reported in lower case.
TorqueUnit("380Nm@ 1750-2750rpm")

'nm'

In [107]:
def AbsCorrGvName(x,y):
    """
    Absolute correlation between a predictor and a target.

    x and y are both Series.
    x is the predictor, y is the target.
    Returns 0 when x has a single element or when either Series has zero
    standard deviation (after rounding); otherwise abs(y.corr(x)).
    """
    if len(x) == 1:
        return 0

    # Rounding to 10 decimals treats tiny floating-point spreads as exactly zero
    predictor_spread = round(x.std(), 10)
    target_spread = round(y.std(), 10)

    # The correlation is undefined for a constant Series, so report 0 instead
    if predictor_spread == 0 or target_spread == 0:
        return 0

    return abs(y.corr(x))

In [108]:
def FindNameinX(x, name):
    """
    Tell whether name occurs as a substring of x.

    x: string object
    Returns True if x.find(name) locates name, False otherwise.
    """
    # str.find yields -1 when there is no match and a non-negative index otherwise
    return x.find(name) >= 0

In [109]:
# Sanity check: "bch" is not a contiguous substring of the text, so the result is False.
FindNameinX("abc hjkhd", "bch")

False

In [110]:
def Extract_brand(x):
    """
    Extract the brand from a name string.

    x is a string
    The brand is the first whitespace-separated word, except for names that
    start with "Land Rover", where the first two words are kept.
    """
    words = x.split()
    if x.startswith("Land Rover"):
        # The prefix itself contains a space, so two words are always available here
        return " ".join(words[:2])
    return words[0]

In [111]:
# Sanity check: "Land Rover" is only special-cased at the very start of the string,
# so this input falls back to returning just its first word.
Extract_brand("TLand Rover 223 jo")

'TLand'

In [112]:
def Extract_brand_model(x):
    """
    Extract the brand together with the model from a name string.

    x is a string
    Returns the first two whitespace-separated words, or the first three
    when the name starts with "Land Rover" (a two-word brand).
    If the name has fewer words than that, the words that are available are
    returned instead of raising IndexError (an empty name gives "").
    """
    words = x.split()
    # Two-word brand plus one model word, otherwise one brand word plus one model word
    wanted = 3 if x.startswith("Land Rover") else 2
    # Slicing never raises, so short names such as "Maruti" are handled gracefully
    return " ".join(words[:wanted])

In [113]:
# Sanity check: a name starting with "Land Rover" keeps the two-word brand plus the next word.
Extract_brand_model("Land Rover 223 jo")

'Land Rover 223'

In [114]:
def PowerSet_exclude_empty(x):
    """
    Build every non-empty subset of the array x.

    Return: A List of tuples, one per subset, ordered by subset size
    (all 1-element subsets first, then 2-element subsets, and so on).
    """
    largest = len(x)
    # Walk the sizes 1..len(x) and collect the combinations of each size in turn
    return [
        members
        for size in range(1, largest + 1)
        for members in combinations(x, size)
    ]

In [115]:
# Example: all non-empty subsets of a three-element list; index 3 is the
# first two-element subset because the three singletons come first.
power_set = PowerSet_exclude_empty(["one","two","three"])
subset = power_set[3]

In [116]:
def Min_at_element(ar):
    """
    Find the position of the first occurrence of the minimum value.

    Note: ar is a pd.Series
    Returns the zero-based position (iteration order, not the index label) of
    the first element equal to min(ar). Returns None if no element compares
    equal to that minimum.
    """
    smallest = min(ar)
    # enumerate tracks the position, so no manual counter is needed
    for pos, val in enumerate(ar):
        if val == smallest:
            return pos
    return None

In [117]:
# Sanity check: the minimum (-2) is the last of the five values, i.e. zero-based position 4.
x = pd.Series([1,-1,1,2,-2])
Min_at_element(x)

4

In [118]:
#Note: Save this script before running it in Data_Cleaning.ipynb