In [292]:
import pandas as pd
from math import isnan
import re
import numpy as np
from googlesearch import search
import random
import matplotlib.pyplot as plt

In [331]:
Categories = ['主食类','佐料类','水产类','水果果干类','蔬菜类','动物肉类']
D = {'name':[],'p':[],'category':[]}
for category in Categories:
    temp = pd.read_csv(category + '.csv')
    for row in range(temp.shape[0]):
        L = list(temp.loc[row])
        for element in L:
            if isinstance(element,str):
                try:
                    float(element)
                    D['p'].append(element)
                except ValueError:
                    if element in Categories:
#                         print("***")
#                         print(element)
#                         print("***")
                        pass
                    elif element[0] == '<':
                        D['p'].append(float(element[1:]))
                    else:    
#                         print(element)
                        D["name"].append(element)
                        D["category"].append(category)
            elif isinstance(element,float) and not isnan(element):
                #print(element)
                D['p'].append(element)

In [332]:
Df = pd.DataFrame(D)
Df.to_csv("食物嘌呤.csv")

In [333]:
class Purine:
    def __init__(self,df):
        self.__df__ = df
        for idx in self.__df__.index:
            self.__df__.loc[idx]['p'] = float(self.__df__.loc[idx]['p'])
        
    def find(self,kw):
        Temp = re.compile(f"\w*{kw}\w*")
        indexes =  [idx for idx in self.__df__.index if Temp.search(self.__df__.loc[idx]['name']) is not None]
        return self.__df__.loc[indexes]
    
    def __repr__(self):
        return str(self.__df__)
    
    def __getitem__(self,index):
        return self.__df__.loc[index]
    
    def match(self,KWs):
        indexes = []
        for idx in self.__df__.index:
            for kw in KWs:
                if re.search(f"\w*{kw}\w*",self.__df__.loc[idx]['name']) is not None:
                    indexes.append(idx)
                    break
        return self.__df__.loc[indexes]
    

In [334]:
P = Purine(Df)
P.find('猪')

Unnamed: 0,name,p,category
218,猪血,11.8,动物肉类
219,猪皮,29.8,动物肉类
221,猪心,65.3,动物肉类
222,猪脑,66.3,动物肉类
229,瘦猪肉,122.5,动物肉类
231,猪肚,132.4,动物肉类
232,猪腰子,132.6,动物肉类
233,猪肉,132.6,动物肉类
239,猪肝,169.5,动物肉类
242,猪大肠,262.2,动物肉类


In [335]:
print("Average")
for cat in Categories:
    tmp = np.average(Df.loc[Df.category == cat].p)
    print(f'{cat}:{tmp}')

Average
主食类:25.589743589743588
佐料类:162.16363636363636
水产类:281.7066666666667
水果果干类:15.016666666666664
蔬菜类:54.15081967213115
动物肉类:197.8806451612903


# Data Enginearing

### 1. Add Weather Organ Or Not

In [336]:
#Df = pd.read_csv("食物嘌呤.csv")

In [337]:
organKW = ['肝','脾','心','肚','信','肠','腰']
res = P.match(organKW)
IsOrgan = []
for idx in Df.index:
    if idx in res.index and Df.loc[idx].category == '动物肉类':
        #print(Df.loc[idx])
        IsOrgan.append(1)
    else:
        IsOrgan.append(0)

In [338]:
Df['IsOrgan'] = IsOrgan
Df = pd.get_dummies(Df,columns=['category'])
display(Df)

Unnamed: 0,name,p,IsOrgan,category_主食类,category_佐料类,category_动物肉类,category_水产类,category_水果果干类,category_蔬菜类
0,牛奶,1.4,0,1,0,0,0,0,0
1,皮蛋白,2.0,0,1,0,0,0,0,0
2,红薯,2.4,0,1,0,0,0,0,0
3,鸡蛋黄,2.6,0,1,0,0,0,0,0
4,荸荠,2.6,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
244,猪脾,270.6,1,0,0,1,0,0,0
245,鸡肝,293.5,1,0,0,1,0,0,0
246,鸭肝,301.5,1,0,0,1,0,0,0
247,熏羊脾,773.0,1,0,0,1,0,0,0


### Data Prep

In [339]:
def idx_split(idx_max,portion_train = 0.8):
    L = [i for i in range(idx_max)]
    random.shuffle(L)
    cut_off = round(idx_max*portion_train)
    return L[:cut_off],L[cut_off:]

In [340]:
train_idx,test_idx = idx_split(249)
train = Df.loc[train_idx]
test = Df.loc[test_idx]

In [341]:
for i in range(1,8):
    exec(f"x{i}_train = train[Df.columns[{i+1}]]")
    exec(f"x{i}_test = test[Df.columns[{i+1}]]")

In [342]:
X_train = []
X_test = []
for i in range(1,8):
    exec(f"X_train.append(np.array(x{i}_train))" )
    exec(f"X_test.append(np.array(x{i}_test))" )
# print(len(X_test))
# print(len(X_train))
X_train.append(np.ones((len(X_train[0]),)))
X_train = np.array(X_train).T
Y_train = np.array(train.p)
X_test.append(np.ones((len(X_test[0]),)))
X_test = np.array(X_test).T
Y_test = np.array(test.p)
# print(X_test.shape)
# print(X_train.shape)

# Linear Regression

In [352]:
def line_reg_grad_dec(X,Y,n_feature,theta = None,eta = 0.002,n_iter = 1e3,tol = 1-3,plot = False):
    """
    @eta:learing rate
    @n_iter:number of interations before stopping 
    @theta:the initial guess for the coefficient
    """
    if theta is None:
        theta = np.random.randn(X.shape[1],1)
    def Transform_fun(X:list,n_feature=1):
        """
        Transform X into a poly nomial matrix M, where M[:,i] := X^(n-i)
        """
        X = np.array(X)
        A = np.zeros((X.size,n_feature+1))
        for i in range(n_feature+1):
            A[:,i] = (X**(i)).reshape(len(X))
        return np.array(A)
    #_____________________________________________________#
    Y = Y.reshape(-1,1)
    A = X# Transform_fun(X,n_feature)
    for i in range(int(n_iter)):
        gradient = A.T@((A@theta).reshape(-1,1)-Y)
        temp = theta
        theta = theta.reshape(-1,1) - eta*gradient
        if np.linalg.norm(temp-theta) < tol:
            plt.show()
            return theta
        if all([plot,(i+1)%(n_iter/4) == 0]):
            xx = np.linspace(min(X),max(X),100)
            AA = Transform_fun(xx,n_feature)
            yy = AA@theta
            plt.plot(xx,yy,'-.')
    plt.show()
    return theta

In [380]:
features = ['IsOrgan','category_主食类','category_佐料类','category_动物肉类','category_水产类','category_水果果干类','category_蔬菜类']
beta = line_reg_grad_dec(X_train,Y_train,7)
def intepret(Y,features,beta,intersect_idx = -1,decimals = 2):
    beta = list(beta.reshape(-1,))
    res = f"{Y} = {np.round(beta[intersect_idx],decimals)}+"
    beta.pop(intersect_idx)
    for feature,coef in zip(features,beta):
        #print(feature,coef)
        res += str(np.round(coef,decimals)) + "*" + str(feature)
    return res
print(intepret("Purine",features,beta))

Purine = 103.53+61.66*IsOrgan-78.86*category_主食类52.54*category_佐料类71.59*category_动物肉类208.9*category_水产类-87.68*category_水果果干类-60.97*category_蔬菜类


In [384]:
X = np.array(Df[features])
X = np.concatenate([X,np.ones((X.shape[0],1))],axis = 1)
Y_pred = X@beta
Df['Y_pred'] = Y_pred
display(Df)

Unnamed: 0,name,p,IsOrgan,category_主食类,category_佐料类,category_动物肉类,category_水产类,category_水果果干类,category_蔬菜类,Y_pred
0,牛奶,1.4,0,1,0,0,0,0,0,24.67714
1,皮蛋白,2.0,0,1,0,0,0,0,0,24.67714
2,红薯,2.4,0,1,0,0,0,0,0,24.67714
3,鸡蛋黄,2.6,0,1,0,0,0,0,0,24.67714
4,荸荠,2.6,0,1,0,0,0,0,0,24.67714
...,...,...,...,...,...,...,...,...,...,...
244,猪脾,270.6,1,0,0,1,0,0,0,236.781702
245,鸡肝,293.5,1,0,0,1,0,0,0,236.781702
246,鸭肝,301.5,1,0,0,1,0,0,0,236.781702
247,熏羊脾,773.0,1,0,0,1,0,0,0,236.781702


In [407]:
def R_sqr(y_true,y_pred):
    mean = np.average(y_true)
    Rss = sum([(y_true[idx]-y_pred[idx])**2 for idx in range(len(y_true))])
    Tss = sum(list((y_pred- np.average(y_pred))**2))
    print(Rss)
    print(Tss)
    return 1 - Rss/Tss

In [408]:
R_sqr(Y_pred,Df.p)

[17082264.839006256]
19829397.672851413


array([0.13853839028132853], dtype=object)