<h1>Name: Eshan Mehrotra</h1>

In [1]:
import pandas as pd

## Data Read and Setup

In [2]:
# The separator is a regex matching runs of whitespace. It is written as a raw
# string so the backslash reaches pandas literally instead of relying on the
# invalid '\s' string escape (which newer Python versions warn about).
df = pd.read_csv('data/tennis.txt', sep=r'\s+')
df

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,d1,sunny,hot,high,weak,no
1,d2,sunny,hot,high,strong,no
2,d3,overcast,hot,high,weak,yes
3,d4,rainy,mild,high,weak,yes
4,d5,rainy,cool,normal,weak,yes
5,d6,rainy,cool,normal,strong,no
6,d7,overcast,cool,normal,strong,yes
7,d8,sunny,mild,high,weak,no
8,d9,sunny,cool,normal,weak,yes
9,d10,rainy,mild,normal,weak,yes


In [3]:
# Print the row count plus the non-null count and dtype of every column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   day       14 non-null     object
 1   outlook   14 non-null     object
 2   temp      14 non-null     object
 3   humidity  14 non-null     object
 4   wind      14 non-null     object
 5   play      14 non-null     object
dtypes: object(6)
memory usage: 800.0+ bytes


In [4]:
# Summary statistics per column (count / unique / top / freq for these object columns).
df.describe()

Unnamed: 0,day,outlook,temp,humidity,wind,play
count,14,14,14,14,14,14
unique,14,3,3,2,2,2
top,d1,sunny,mild,high,weak,yes
freq,1,5,6,7,8,9


In [5]:
# Inspect the row index as loaded, before it is replaced by the 'day' column below.
df.index

RangeIndex(start=0, stop=14, step=1)

In [6]:
# List the column labels; these are the feature names usable in probability expressions.
df.columns

Index(['day', 'outlook', 'temp', 'humidity', 'wind', 'play'], dtype='object')

In [7]:
# Use the day label as the row index.
# The guard keeps this cell re-runnable: after the first run 'day' is the index
# rather than a column, and an unguarded set_index('day') would raise KeyError.
if 'day' in df.columns:
    df = df.set_index('day')
df.head()

Unnamed: 0_level_0,outlook,temp,humidity,wind,play
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
d1,sunny,hot,high,weak,no
d2,sunny,hot,high,strong,no
d3,overcast,hot,high,weak,yes
d4,rainy,mild,high,weak,yes
d5,rainy,cool,normal,weak,yes


In [8]:
# Confirm that the index now holds the day labels.
df.index

Index(['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8', 'd9', 'd10', 'd11',
       'd12', 'd13', 'd14'],
      dtype='object', name='day')

## Function Definitions

In [9]:
def _parse_assignments(expr):
    '''
    Turn a comma-separated list of "name=value" terms into a dictionary.

    Whitespace around names and values is ignored. Each term must contain
    exactly one "=" and a non-empty feature name; otherwise ValueError is
    raised so that a malformed expression is never silently mis-paired.
    '''
    pairs = {}
    for term in expr.split(","):
        name, sep, value = term.partition("=")
        name, value = name.strip(), value.strip()
        if not sep or not name or "=" in value:
            raise ValueError(
                "Expected 'feature=value' but got {!r}".format(term.strip()))
        pairs[name] = value
    return pairs


def parse_features(s):
    '''
    Parse the string s and return the (Event, Given) expression dictionaries.

    Event and Given params are separated by "|"; without a "|" the Given
    dictionary is empty. Within each side, individual expressions are
    separated by "," and each has a feature name and a value separated by "=".
    The feature name is the dictionary key and its value is the dictionary
    value. Surrounding whitespace is stripped everywhere.

    Example:
        'play=no | outlook=sunny, wind = weak'
        -> ({'play': 'no'}, {'outlook': 'sunny', 'wind': 'weak'})

    Raises:
        ValueError: if s contains more than one "|" or any term is not of
        the form "feature=value".
    '''
    event_expr, bar, given_expr = s.partition("|")
    if "|" in given_expr:
        raise ValueError(
            "Expression may contain at most one '|': {!r}".format(s))

    evedict = _parse_assignments(event_expr)
    # Only parse the right-hand side when a "|" was actually present.
    givdict = _parse_assignments(given_expr) if bar else {}

    return (evedict,givdict)

In [10]:
def compute_exp(ed,gd):
    '''
    Build the Event mask and Given mask as strings from the event and given dictionaries.

    Each dictionary entry becomes one "(df[<name>]==<value>)" comparison and the
    comparisons of one dictionary are joined with " & ". The strings refer to
    a variable named df and are meant to be evaluated by the caller.

    Parameters:
        ed: event dictionary {feature name: value}
        gd: given dictionary {feature name: value}; may be empty

    Returns:
        (event_mask_string, given_mask_string); a mask string is '' when its
        dictionary is empty (no conditional part).
    '''
    def _mask_expr(conditions):
        # repr() emits a correctly quoted and escaped Python literal, so names
        # or values containing quotes or backslashes still yield valid source.
        # For plain tokens the text is the same as manual '...' quoting.
        return " & ".join(
            "(df[{!r}]=={!r})".format(k, v) for k, v in conditions.items())

    return (_mask_expr(ed), _mask_expr(gd))

In [11]:
def probability(df):
    '''
    Takes the Dataframe as input and returns a function p(s) which computes the
    probability of any expression s from the rows of df.
    '''
    def _count(mask_expr):
        '''Number of rows of df for which the mask expression string is true.'''
        # NOTE(review): eval executes source text generated by compute_exp from
        # the user's expression -- only use with trusted expression strings.
        # df is passed explicitly so the lookup does not depend on closure scope.
        return int(eval(mask_expr, {'df': df}).sum())

    def p(s):
        '''
        Takes the string s which contains the expression and calculates its
        probability based on the Dataframe passed to probability(df).

        's' is either "event" or "event | given", where each side is a
        comma-separated list of feature=value terms. Whitespace in s is
        handled by parse_features(s); compute_exp() generates the mask
        expressions for the dataframe.

        Returns matching rows / total rows for a plain expression, and
        rows matching event AND given / rows matching given for a
        conditional one. Returns 0 when the denominator is zero
        (P(Event|Given) = 0 if P(Given) = 0, or df has no rows).
        '''
        eventdict,givendict = parse_features(s)
        event_mask,given_mask = compute_exp(eventdict,givendict)

        # given_mask is '' exactly when there is no conditional part.
        if given_mask:
            denominator = _count(given_mask)
            numerator_expr = event_mask + ' & ' + given_mask
        else:
            denominator = len(df)
            numerator_expr = event_mask

        # Avoid dividing by zero; the numerator is not evaluated in that case.
        if denominator == 0:
            return 0
        return _count(numerator_expr)/denominator
    return p

## Probability Calculations - Function Tests

In [12]:
# Marginal probability P(play = yes) over all rows of df.
p = probability(df)
print( p('play=yes') )

0.6428571428571429


In [13]:
# Marginal probability with extra whitespace around the feature name and value.
p = probability(df)
print( p(' play = no ') )

0.35714285714285715


In [14]:
# Conditional probability P(outlook = sunny | play = yes).
p = probability(df)
print(p('outlook = sunny | play = yes'))

0.2222222222222222


In [15]:
# Conditional probability with two comma-separated conditions on the given side.
p=probability(df)
print(p('play=no | outlook=sunny, wind = weak'))

0.6666666666666666


In [16]:
# P(outlook = rainy | play = no); relies on the p created in an earlier cell.
print(p('outlook=rainy | play=no'))

0.4


In [17]:
# Same query as the earlier 'outlook = sunny | play = yes' cell, written without spaces.
print(p('outlook=sunny | play=yes'))

0.2222222222222222


In [18]:
# Joint event (play = no AND outlook = sunny) conditioned on two given features.
print(p('play=no, outlook=sunny | wind = weak, humidity=high'))

0.5


In [19]:
# Prints the int 0 (not 0.0): the zero-denominator guard in p fired because
# no row satisfies all three given conditions.
print(p('play=no | temp=hot, humidity=high, outlook=rainy'))

0


## Archived Functions based on iterations

<!-- # def probability(df):
#     '''
#     WRITE DOCSTRING HERE!
#     '''
#     def p(s):
#         pexp = [x.strip() for x in s.strip().split("=")]
#         ans = len(df[(df[pexp[0]]==pexp[1])])/len(df[pexp[0]])
#         return ans 
#     return p -->

In [20]:
# def probability(df):
#     '''
#     WRITE DOCSTRING HERE!
#     '''
#     def p(s):
#         pexp = [x.strip() for x in s.strip().split("=")]
#         ans = len(df[(df[pexp[0]]==pexp[1])])/len(df[pexp[0]])
#         return ans 
#     return p

In [21]:
# def probability(df):
#     '''
#     WRITE DOCSTRING HERE!
#     '''
#     def p(s):
#         event,given = parse_features(s)
#         event_mask = (df[event[0]]==event[1]) & (df[given[0]]==given[1])
#         given_mask = (df[given[0]]==given[1])
#         if (len(df[given_mask])==0):
#             return 0
#         else:
#             ans = len(df[event_mask])/len(df[given_mask])
#             return ans 
#     return p

In [22]:
# def probability(df):
#     '''
#     WRITE DOCSTRING HERE!
#     '''
#     def p(s):
#         eventdict,givendict = parse_features(s)
#         event_mask,given_mask = compute_exp(eventdict,givendict)
        
#         if (len(df[eval(given_mask)])==0):
#             return 0
#         else:
#             ans = len(df[eval(event_mask + ' & '+ given_mask)])/len(df[eval(given_mask)])
#             return ans 
#     return p

In [23]:
# def parse_features(s):
#     '''
#     Parse the string s to return the expressions lists of the Given and Event params.
#     Event and Given params are separated by |.
#     Individual expression has feature name and value separated by =
#     '''
#     expeve,expgiv = [e.strip() for e in s.strip().split("|")]
#     eventlst = [y.strip() for x in expeve.split(",") for y in x.split("=")]
#     givenlst = [y.strip() for x in expgiv.split(",") for y in x.split("=")]
#     evedict = {eventlst[i]:eventlst[i+1] for i in range(0,len(eventlst),2)}
#     givdict = {givenlst[i]:givenlst[i+1] for i in range(0,len(givenlst),2)}
#     return (evedict,givdict)
# #     return [[y.strip() for y in x.strip().split("=") ] for x in s.strip().split("|")]

In [24]:
# def compute_exp(ed,gd):
#     eventsexp_str = " & ".join([str("(df['" + k + "']=='" + v + "')") for k,v in ed.items()])
#     givexp_str = " & ".join([str("(df['" + k + "']=='" + v + "')") for k,v in gd.items()])
#     return (eventsexp_str,givexp_str)