In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.manifold import MDS
import re

In [2]:
import spacy
from spacy.lang.en import stop_words

In [3]:
# Documents
doc = pd.read_csv('edmunds_extraction.csv', header=None)
doc

Unnamed: 0,0,1,2
0,wishnhigh1,April 2002,The problem is that they are HUGE generalizat...
1,kd6aw1,April 2002,Have found out that with some of the more pow...
2,fwatson,April 2002,How does your theory explain English cars? A ...
3,dave330i,April 2002,"""Being that it is an automatic I can enjoy my..."
4,blueguydotcom,April 2002,"He did mention ""in rush hour traffic."" Like t..."
...,...,...,...
5145,laurasdada,May 2006,"I had a '99 300M, loved it. New 300? Well, th..."
5146,ivorypearlg,May 2006,I've been in the 300M and 300C... Much differ...
5147,shipo,May 2006,"I don't know, maybe it's just me or maybe my ..."
5148,dhanley,May 2006,"Actually, the m35x gets 17/24 mpg, same as th..."


In [None]:
#pip install nltk

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\TYS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
doc.columns = ["users", "time", "txt"]
doc.head()

Unnamed: 0,users,time,txt
0,wishnhigh1,April 2002,The problem is that they are HUGE generalizat...
1,kd6aw1,April 2002,Have found out that with some of the more pow...
2,fwatson,April 2002,How does your theory explain English cars? A ...
3,dave330i,April 2002,"""Being that it is an automatic I can enjoy my..."
4,blueguydotcom,April 2002,"He did mention ""in rush hour traffic."" Like t..."


In [9]:
# tokenize and lower case
import nltk
from nltk import word_tokenize

tokens = []
for i in range(len(doc)):
    tokens.append(word_tokenize(doc.loc[i,'txt'].lower()))
doc['txt_tokenized'] = tokens

In [11]:
doc.head()

Unnamed: 0,users,time,txt,txt_tokenized
0,wishnhigh1,April 2002,The problem is that they are HUGE generalizat...,"[the, problem, is, that, they, are, huge, gene..."
1,kd6aw1,April 2002,Have found out that with some of the more pow...,"[have, found, out, that, with, some, of, the, ..."
2,fwatson,April 2002,How does your theory explain English cars? A ...,"[how, does, your, theory, explain, english, ca..."
3,dave330i,April 2002,"""Being that it is an automatic I can enjoy my...","[``, being, that, it, is, an, automatic, i, ca..."
4,blueguydotcom,April 2002,"He did mention ""in rush hour traffic."" Like t...","[he, did, mention, ``, in, rush, hour, traffic..."


In [17]:
attributes = pd.read_csv('Attributes.csv', header=0)
attributes

Unnamed: 0,Feature,Mapping
0,luxury,comfort
1,room,comfort
2,tall,comfort
3,interior,comfort
4,interiors,comfort
...,...,...
104,replacement,maintenance
105,suspension,performance
106,transmission,performance
107,transmissions,performance


In [19]:
features = [feature.lower() for feature in attributes['Feature'].tolist()]
mapping = [map_item.lower() for map_item in attributes['Mapping'].tolist()]
    

In [23]:
mapping_dict = {}

for i in range(len(features)):
    mapping_dict[features[i]] = mapping[i]

### Remove stopwords/punctuations

In [20]:
stop_words = list(stop_words.STOP_WORDS)
punc = ['.',',','!','?',';',':',"'","''", '"','``','(',')','[',']','{','}','-','...','–', '/','\\','&','%','$','#','@','+',
    '*','=','>','<','|','~','_']

In [21]:
def remove_stopwords(lst):
    tokens_wo_stopwords = []
    for word in lst:
        if word not in stop_words:
            if word not in punc:
                tokens_wo_stopwords.append(word)
    return tokens_wo_stopwords

### Replace feature words into to mapping words

In [24]:
def features_to_mapping(lst):
    
    new_lst = []
    for word in lst:
        new_lst.append(mapping_dict.get(word, word))
        
    return new_lst

In [27]:
doc["txt_token_mapped"] = doc["txt_tokenized"].apply(features_to_mapping)

In [29]:
doc["txt_wo_stopwords"] = doc["txt_token_mapped"].apply(remove_stopwords)

### Attribute Frequency

In [31]:
txt_combined = []

for i in range(len(doc)):
    txt_combined.extend(doc.loc[i, "txt_wo_stopwords"]) 

In [33]:
freq_dist = dict(nltk.FreqDist(word for word in txt_combined))
freq_dist

{'problem': 197,
 'huge': 69,
 'generalizations': 4,
 'talking': 203,
 'japanese': 196,
 'exclusivly': 1,
 'honda': 229,
 'toyota': 131,
 'subaru': 140,
 'mazda': 76,
 'isuzu': 4,
 'mitsubishi': 19,
 'daihatsu': 2,
 'suzuki': 12,
 'far': 290,
 'makes': 289,
 'culture': 5,
 'american': 139,
 'european': 88,
 'capable': 46,
 'having': 193,
 'higher': 134,
 'quality': 205,
 '70': 58,
 'cars': 1963,
 'help': 99,
 'generalization': 5,
 'stop': 108,
 'brands': 89,
 'nations': 1,
 'found': 144,
 'powerful': 93,
 'like': 1835,
 'g35': 1327,
 'corvette': 40,
 'difference': 315,
 'performance': 8905,
 'practical': 25,
 'standpoint': 12,
 'issue': 145,
 'faster': 180,
 'dream': 17,
 'driving': 774,
 'killing': 4,
 'getting': 324,
 'tickets': 3,
 'time': 603,
 'love': 268,
 'enjoy': 126,
 'coffee': 6,
 'shave': 4,
 'talk': 90,
 'cel': 3,
 'phone': 34,
 'hug': 5,
 'wife': 146,
 'rush': 18,
 'hour': 28,
 'traffic': 115,
 'feel': 419,
 'urge': 2,
 'shift': 92,
 'manumatic': 12,
 'pretend': 5,
 'stick

In [36]:
features_dist = {}
features_list = attributes['Mapping'].tolist()

for feature in features_list:
    features_dist[feature] = freq_dist.get(feature, None)


In [37]:
features_dist

{'comfort': 1999,
 'console': 491,
 'efficiency': 978,
 'performance': 8905,
 'safety': 881,
 'styling': 715,
 'maintenance': 371}

In [39]:
top_5_features_dist = dict(sorted(features_dist.items(), key=lambda item: item[1], reverse=True)[:5])
top_5_features_dist

{'performance': 8905,
 'comfort': 1999,
 'efficiency': 978,
 'safety': 881,
 'styling': 715}

In [45]:
# Convert to DataFrame with keys as a separate column
top_5_features_dist = pd.DataFrame(list(top_5_features_dist.items()), columns=['Feature', 'Frequency'])
top_5_features_dist

Unnamed: 0,Feature,Frequency
0,performance,8905
1,comfort,1999
2,efficiency,978
3,safety,881
4,styling,715


### Lift Matrix

In [40]:
brand = pd.read_csv('top10_brands.csv', header=0)

In [44]:
top_5_brands = brand[:5]
top_5_brands

Unnamed: 0,brand
0,bmw
1,acura
2,infiniti
3,audi
4,mercedes


In [46]:
pairs = []
for i in range(len(top_5_brands['brand'])):
    for j in range(i + 1,len(top_5_features_dist['Feature'])):
        pair = (top_5_brands['brand'][i], top_5_features_dist['Feature'][j])
        pairs.append(pair)

In [47]:
# calculate presence of each pair in txt
pairs_freq_dict = {}
for pair in pairs:
    pairs_freq_dict[pair] = 0

In [48]:
txt_wo_stopwords_lst = doc["txt_wo_stopwords"].tolist()

for key, value in pairs_freq_dict.items():
    brand = key[0]
    feature = key[1]
    for lst in txt_wo_stopwords_lst:
        if brand in lst and feature in lst:
            pairs_freq_dict[key] = pairs_freq_dict[key] + 1

pairs_freq_dict

{('bmw', 'comfort'): 319,
 ('bmw', 'efficiency'): 117,
 ('bmw', 'safety'): 137,
 ('bmw', 'styling'): 152,
 ('acura', 'efficiency'): 29,
 ('acura', 'safety'): 45,
 ('acura', 'styling'): 60,
 ('infiniti', 'safety'): 31,
 ('infiniti', 'styling'): 41,
 ('audi', 'styling'): 46}

In [49]:
#lift calculation: P(AB)/(P(A)*P(B))
lift_dict = {}


total_txt = len(doc)

for key, value in pairs_freq_dict.items():
    
    AB = value
    count_A = top_5_features_dist[key[0]]
    count_B = top_5_brands[key[0]]
    
    lift_dict[key] = total_txt*(AB)/(count_A*count_B)

KeyError: 'bmw'

In [None]:
lift_dict