In [2]:
import nltk
import pandas as pd
import numpy as np
import sys

## Identifier table (provides the `brand` and `brand_id` columns used below), ordered by brand_id.
ids = pd.read_csv('car_model_identifier.csv', header=0).sort_values(by=['brand_id'])

## Detection records; the first CSV column is a saved index, so it is dropped.
data = pd.read_csv('entire_data_Mentioned_4th.csv', header=0)
data = data.iloc[:, 1:]

import re
from mlxtend.preprocessing import TransactionEncoder
## NOTE(review): `te` is not referenced anywhere else in the visible part of this notebook — confirm whether it is still needed.
te = TransactionEncoder()

## Generate a list of unique brand names 

In [3]:
## Unique brand names, in order of first appearance in `ids.brand`.
## dict.fromkeys de-duplicates while preserving insertion order, which replaces
## the quadratic "append if not already in the list" scan with a single pass.
unique_brand = list(dict.fromkeys(ids.brand))

### Add 40 new columns, one per brand name, to the dataframe that contains the detected entities and the org_id information.
### Replace the NaN values in the dataframe with 0s.
### Convert the brand_ids to integers.

In [4]:
## Append one zero-filled column per brand name (existing columns keep their order).
## reindex adds the columns directly; row-concatenating an empty DataFrame to get
## new columns relies on deprecated concat behaviour and yields object-dtype columns.
new_brand_columns = [brand for brand in unique_brand if brand not in data.columns]
data = data.reindex(columns=[*data.columns, *new_brand_columns], fill_value=0)
## Remaining NaNs in the original columns become 0 so brand_id can be cast to int.
data = data.replace(np.nan, 0)
data.brand_id = data.brand_id.astype(int)

### For each detection, if its brand_id matches the brand_id of one of the unique brand names, we set the cell in that brand's column to 1 to record the occurrence. We then save the dataframe to "PyShark_brand.csv", which holds all the detection information together with the per-brand occurrence indicators.

In [7]:
## One-hot encode brand_id: every row gets a 1 in the column of the brand it mentions.
N_BRANDS = 40          ## brand_ids are assumed to run 1..40, in brand-column order — TODO confirm
FIRST_BRAND_COL = 7    ## 0-based position of the first brand column (i.e. the 8th column)

brand_id_values = data.brand_id.to_numpy()
for offset in range(N_BRANDS):
    ## One boolean row mask per brand replaces the row-by-row scalar writes.
    data.iloc[brand_id_values == offset + 1, FIRST_BRAND_COL + offset] = 1

In [27]:
## Persist the encoded detections. The DataFrame index is written as the first CSV column,
## which is why the reload below drops column 0.
data.to_csv("PyShark_brand.csv")

### Below we focus only on the part that counts the occurrences of brand names.

In [8]:
## Reload the saved detections; the first CSV column holds the saved index and is dropped.
data = pd.read_csv('PyShark_brand.csv', header=0)
data = data.iloc[:, 1:]

## Keep everything from column position 6 onwards (org_id is grouped on below, so it must be among them).
data_record = data.iloc[:, 6:]

### Add up the occurrences of brand names within a single review (grouped by org_id), and save this condensed file.

In [10]:
## Sum the per-detection indicators within each org_id to get one row per review.
org_groups = data_record.groupby(['org_id'])
data_condensed = org_groups.sum()

## org_id is the index at this point, so it is written as the first CSV column.
data_condensed.to_csv("PyShark_condensed_brand.csv")

In [16]:
## data_condensed is a matrix with one column per brand and one row per comment (org_id);
## each cell holds the number of times that brand was mentioned in that comment.
## The first CSV column (the org_id index written by to_csv above) is dropped on reload.

data_condensed = pd.read_csv("PyShark_condensed_brand.csv", header=0).iloc[:, 1:] 

### Replace all values > 1 with 1, because later we multiply the columns of every possible brand pair, and the sum of these products is the number of comments in which the two brands are co-mentioned. The non-zero cells therefore need to be 1; otherwise we would get products such as 2 * 3 = 6 (instead of 1 * 1 = 1), which does not make sense in our case.

In [18]:
## Binarise the count matrix: every value above 1 is capped at 1, values <= 1 are kept.
## clip is vectorised, replacing the element-by-element Python lambda.
data_condensed = data_condensed.clip(upper=1)

In [13]:
from itertools import combinations

class lift_calculator:
    """Pairwise lift between the columns of an indicator DataFrame.

    ``data`` is expected to hold one row per transaction (comment) and one
    column per item (brand), with 1 where the item occurs and 0 otherwise.
    For a pair (a, b) the lift is::

        count(a and b) * n_rows / (count(a) * count(b))

    Call :meth:`calculate` before :meth:`top_n` or :meth:`return_csv`.
    """

    def __init__(self, data):
        """Pre-compute per-column counts and co-occurrence counts for every column pair."""
        self.data = data
        # Number of rows in which each column is set (column sums).
        self.brand_count = dict(data.sum(axis=0))
        # Co-occurrence count per unordered pair; keys follow the column order of `data`.
        self.pair_brand_count = {
            (brand1, brand2): (data[brand1] * data[brand2]).sum()
            for brand1, brand2 in combinations(data.columns, 2)
        }
        self.lift_dict = dict()
        self.total_size = len(data)

    def calculate(self):
        """Fill ``self.lift_dict`` with the lift of every column pair.

        A pair whose denominator is zero (at least one column never occurs)
        gets NaN instead of triggering a division by zero.
        """
        for (brand1, brand2), pair_count in self.pair_brand_count.items():
            denominator = self.brand_count[brand1] * self.brand_count[brand2]
            if denominator == 0:
                self.lift_dict[(brand1, brand2)] = float("nan")
            else:
                self.lift_dict[(brand1, brand2)] = pair_count * self.total_size / denominator

    def top_n(self, n=10):
        """Return a dict of the ``n`` pairs with the highest lift (NaN counts as 0).

        Returns an empty dict when no lift has been calculated yet.
        """
        if not self.lift_dict:
            # Nothing calculated: an empty frame has no lift column to sort by.
            return {}
        ranked = (
            pd.DataFrame(list(self.lift_dict.items()), columns=["pair", "lift"])
            .fillna(0)
            .sort_values(by="lift", ascending=False)
            .head(n)
        )
        return dict(zip(ranked["pair"], ranked["lift"]))

    def return_csv(self):
        """Return the lifts as a square DataFrame indexed by column name on both axes.

        Only the (brand1, brand2) orderings present in ``lift_dict`` carry a
        lift; every other cell (diagonal and mirrored pairs) is 0.
        """
        # Use the frame this instance was built from, not a module-level `data`.
        columns = self.data.columns
        df = pd.DataFrame(index=columns, columns=columns)
        for brand1 in columns:
            for brand2 in columns:
                df.loc[brand1, brand2] = self.lift_dict.get((brand1, brand2), 0)
        return df

In [14]:
## Rebind `data` to the condensed comment-by-brand matrix and build the calculator from it.
data = data_condensed 
calculator = lift_calculator(data) ## pass the matrix to lift_calculator
## Populate calculator.lift_dict with the lift of every brand pair.
calculator.calculate()

In [15]:
## Square brand-by-brand table of lifts.
lift_model_df = calculator.return_csv()

## Only the (brand1, brand2) pairs produced by combinations() carry a lift, so the matrix is
## upper triangular without the diagonal; all remaining cells are 0.
lift_model_df.to_csv("PyShark_paired_brand_lift_no_diagonal2.csv")

In [17]:
## The ten brand pairs with the highest lift (NaN lifts are treated as 0 when ranking).
calculator.top_n(10)

{('DODGE', 'RAM'): 13.223722351236063,
 ('DAEWOO', 'SUZUKI'): 11.726353834844401,
 ('DAEWOO', 'HUMMER'): 11.281294200848656,
 ('HUMMER', 'LAND ROVER'): 8.448398628867345,
 ('JAGUAR', 'MERCEDES-BENZ'): 5.657003985689179,
 ('SAAB', 'VOLVO'): 5.565565946564772,
 ('DAEWOO', 'SCION'): 4.838262056414923,
 ('BUICK', 'OLDSMOBILE'): 4.721198668146504,
 ('AUDI', 'SAAB'): 4.1541838747937145,
 ('LINCOLN', 'MERCURY'): 4.152151135493118}

### Follow the instructions from PyShark to calculate the lift and cross-validate our results above (the Apriori and FP-Growth methods generate the same results, only ordered differently).

### https://pyshark.com/market-basket-analysis-using-association-rule-mining-in-python/

In [57]:
from mlxtend.frequent_patterns import apriori
## Mine frequent itemsets with Apriori. The 0/1 matrix is cast to bool because mlxtend
## recommends a boolean one-hot frame (non-bool input is slower and its support is deprecated).
frequent_itemsets_ap = apriori(data_condensed.astype(bool), min_support = 0.001, use_colnames=True) ## the min_support could be modified

In [58]:
## Display the Apriori itemsets with their support (the rendered output below elides the middle rows).
frequent_itemsets_ap

Unnamed: 0,support,itemsets
0,0.062830,(ACURA)
1,0.042393,(AUDI)
2,0.096102,(BMW)
3,0.017443,(BUICK)
4,0.026784,(CADILLAC)
...,...,...
354,0.001081,"(FORD, TOYOTA, HONDA, HYUNDAI)"
355,0.001113,"(FORD, TOYOTA, HONDA, MAZDA)"
356,0.002006,"(NISSAN, FORD, HONDA, TOYOTA)"
357,0.001489,"(NISSAN, TOYOTA, HONDA, HYUNDAI)"


In [51]:
from mlxtend.frequent_patterns import fpgrowth
## Mine frequent itemsets with FP-Growth. The 0/1 matrix is cast to bool because mlxtend
## recommends a boolean one-hot frame (non-bool input is slower and its support is deprecated).
frequent_itemsets_fp=fpgrowth(data_condensed.astype(bool), min_support=0.001, use_colnames=True) ## the min_support could be modified

In [59]:
## Display the FP-Growth itemsets with their support (the rendered output below elides the middle rows).
frequent_itemsets_fp

Unnamed: 0,support,itemsets
0,0.096102,(BMW)
1,0.062830,(ACURA)
2,0.036454,(INFINITI)
3,0.171596,(HONDA)
4,0.044588,(MERCEDES-BENZ)
...,...,...
354,0.001316,"(MERCURY, HONDA)"
355,0.001238,"(MERCURY, LINCOLN)"
356,0.001458,"(MERCURY, TOYOTA)"
357,0.001411,"(SUZUKI, HONDA)"


### Evaluate the lift based on the mlxtend guide:
### http://rasbt.github.io/mlxtend/user_guide/frequent_patterns/association_rules/

In [60]:
from mlxtend.frequent_patterns import association_rules

## Derive association rules from both itemset tables, filtering on the lift metric with a threshold of 1.1.
rules_ap = association_rules(frequent_itemsets_ap, metric="lift", min_threshold=1.1)
rules_fp = association_rules(frequent_itemsets_fp, metric="lift", min_threshold=1.1) # the min_threshold could be modified

### The two PyShark methods (Apriori and FP-Growth) generate the same results as the dictionary-based calculator we defined above.

In [61]:
## Show the first 20 rules derived from the FP-Growth itemsets.
rules_fp.head(20)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,"(TOYOTA, BMW)",(HONDA),0.006034,0.171596,0.0021,0.348052,2.028327,0.001065,1.27066
1,"(HONDA, BMW)",(TOYOTA),0.006911,0.183679,0.0021,0.303855,1.654272,0.000831,1.172631
2,(TOYOTA),"(HONDA, BMW)",0.183679,0.006911,0.0021,0.011433,1.654272,0.000831,1.004574
3,(HONDA),"(TOYOTA, BMW)",0.171596,0.006034,0.0021,0.012239,2.028327,0.001065,1.006282
4,"(NISSAN, BMW)",(HONDA),0.004451,0.171596,0.001191,0.267606,1.559513,0.000427,1.131091
5,"(HONDA, BMW)",(NISSAN),0.006911,0.105662,0.001191,0.172336,1.631002,0.000461,1.080556
6,(NISSAN),"(HONDA, BMW)",0.105662,0.006911,0.001191,0.011273,1.631002,0.000461,1.004411
7,(HONDA),"(NISSAN, BMW)",0.171596,0.004451,0.001191,0.006941,1.559513,0.000427,1.002508
8,"(NISSAN, BMW)",(TOYOTA),0.004451,0.183679,0.001128,0.253521,1.380241,0.000311,1.093562
9,"(TOYOTA, BMW)",(NISSAN),0.006034,0.105662,0.001128,0.187013,1.769911,0.000491,1.100064
