Listed here are all the modules, imports and constants used in this exercise.

In [97]:
import string
from collections import Counter
from math import log

from EnumReviews import enumReviews

# Path of the reviews file that main() passes to load_file().
FILE_INPUT = "reviews.txt"

This function reads the input file, skipping its header line, and returns a list of `[review text (string), label (integer)]` rows, one per line.

In [98]:
def load_file(path2file:str)->list[list[str, int]]:
    """Read a labelled-review file into a list of ``[text, label]`` rows.

    The first line of the file is treated as a header and discarded. Every
    other non-blank line is split at its LAST comma: everything before it is
    the review text (which may itself contain commas) and everything after
    it is parsed as the integer label.

    Args:
        path2file: Path of the UTF-8 encoded file to read.

    Returns:
        One ``[text, label]`` list per non-blank data line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the part after the last comma is not an integer.
    """
    data = []

    with open(path2file, "r", encoding="UTF-8") as fp:
        next(fp, None)  # skip the header line (no error on an empty file)

        for line in fp:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at end of file) carry
                # no review; int("") would otherwise raise ValueError.
                continue
            text, _, label = line.rpartition(",")
            data.append([text, int(label)])

    return data

This function applies the tokenization.<br>
It computes the tokens for each document.<br>
Input: a list of `[text, label]` rows, as returned by `load_file`. The text of each row is the document to tokenize.
<br>Output: a list of lists. Each item is a list containing the lowercase tokens of the
corresponding document.

In [99]:
def tokenize(data:list[list[str, int]])->list[list[str]]:
    """Split the text of every row into lowercase tokens.

    For each row, the field selected by ``enumReviews().DESC`` is taken as
    the document text. Every character of ``string.punctuation`` is turned
    into a space, the text is split on single spaces, empty pieces are
    dropped and the remaining pieces are lowercased.

    Args:
        data: Rows whose ``DESC`` field holds the text to tokenize.

    Returns:
        One list of tokens per row, in the same order as ``data``.
    """
    fields = enumReviews()
    # A single table mapping each punctuation character to a space lets the
    # whole substitution happen in one pass over the text.
    punct_to_space = str.maketrans({mark: " " for mark in string.punctuation})

    tokenized = []
    for row in data:
        text = row[fields.DESC].translate(punct_to_space)
        # Only the space character separates tokens; other whitespace
        # (tabs, newlines) stays inside the pieces.
        tokenized.append([piece.lower() for piece in text.split(" ") if piece])

    return tokenized


This function counts the number of occurrences of each word in every document's list of tokens.

In [100]:
def countFrequencies(token:list[list[str]])->list[dict[str, int]]:
    """Count how many times each token occurs in every document.

    Args:
        token: One list of tokens per document.

    Returns:
        One plain ``dict`` per document, in the same order as ``token``,
        mapping each distinct token to its number of occurrences. Keys keep
        the order in which the tokens first appear in the document.
    """
    # Counter makes a single pass over each document, so the total cost is
    # linear in the number of tokens. Converting to dict keeps the return
    # type a plain dict (a missing key raises KeyError instead of giving 0).
    return [dict(Counter(text)) for text in token]
    

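This function computes the document frequency (DF) of each word, i.e. the number of documents that contain it, and returns a dictionary mapping each word to its IDF, $\log(N / \text{DF})$, where $N$ is the number of documents. The result is not sorted.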

In [101]:
def computeIDF(freq:list[dict[str, int]]) -> dict[str, float]:
    """Compute the inverse document frequency of every word.

    The document frequency (DF) of a word is the number of dictionaries in
    ``freq`` that have it as a key; the per-document counts themselves are
    not used. The IDF is ``log(N / DF)`` with ``N = len(freq)``, so a word
    that appears in every document gets an IDF of 0.0.

    Args:
        freq: One word -> count dictionary per document.

    Returns:
        A dictionary mapping each word to its IDF (natural logarithm), with
        keys in order of first appearance. Empty when ``freq`` is empty.
    """
    numberOfDoc = len(freq)
    # Iterating a dict yields its keys, so every word is counted once per
    # document that contains it.
    df = Counter(word for doc in freq for word in doc)

    return {word: log(numberOfDoc / count) for word, count in df.items()}

This function computes $\text{TF-IDF}_{t,d} = \text{TF}_{t,d} \times \text{IDF}_t$ <br>
In other words, $\text{TF-IDF}_{t,d}$ assigns to term $t$ a weight in document $d$ that is
- high when t occurs many times within a small number of documents;
- low when the term occurs fewer times in a document, or occurs in many documents (thus
offering a less pronounced relevance signal);
- lowest when the term occurs in virtually all documents

In [102]:
def computeTF_IDF(freq:list[dict[str, int]], idf:dict[str, float])->list[dict[str, float]]:
    """Weight every term count of every document by the term's IDF.

    Args:
        freq: One term -> count dictionary per document.
        idf: Term -> IDF dictionary; must contain every term used in ``freq``.

    Returns:
        One term -> ``count * idf[term]`` dictionary per document, in the
        same order as ``freq``.

    Raises:
        KeyError: If a term of ``freq`` is missing from ``idf``.
    """
    return [{term: idf[term] * count for term, count in doc.items()} for doc in freq]

This function separates the documents into positive and negative ones, returning the set of document indices for each class:

In [103]:
def separate_documents(data:list[list[str, int]])->list[set[int], set[int]]:
    """Partition the row indices of ``data`` by the truthiness of their label.

    The label of a row is the field selected by ``enumReviews().LABEL``.

    Args:
        data: Rows carrying a label field.

    Returns:
        A two-item list ``[positive, negative]``: the set of indices whose
        label is truthy, and the set of all remaining indices.
    """
    fields = enumReviews()

    positive = {index for index, row in enumerate(data) if row[fields.LABEL]}
    # Every index that is not positive is negative.
    negative = set(range(len(data))) - positive

    return [positive, negative]
    

Compute the L2-norm of a vector representation

In [104]:
def norm(d:dict[str, float])->float:
    """Return the L2 (Euclidean) norm of a sparse vector.

    Args:
        d: Term -> weight dictionary; only the weights are used.

    Returns:
        The square root of the sum of the squared weights (0.0 for an
        empty dictionary).
    """
    return sum(weight ** 2 for weight in d.values()) ** .5

Compute the dot product between two vector representations

In [105]:
def dot_product(d1:dict[str, float], d2:dict[str, float])->float:
    """Return the dot product of two sparse vectors.

    A term missing from a dictionary counts as weight 0, so only the terms
    present in both dictionaries contribute to the result.

    Args:
        d1: Term -> weight dictionary of the first vector.
        d2: Term -> weight dictionary of the second vector.

    Returns:
        The sum of ``d1[t] * d2[t]`` over the shared terms, as a float
        (0.0 when there is no shared term).
    """
    # Walk the smaller dictionary and probe the larger one: this avoids
    # building the union of both key sets just to add zeros.
    small, large = (d1, d2) if len(d1) <= len(d2) else (d2, d1)
    return sum(
        (weight * large[term] for term, weight in small.items() if term in large),
        0.0,
    )

Compute the cosine similarity between documents d1 and d2.<br>
Input: two dictionaries representing the TF-IDF vectors for documents d1 and d2.<br>
Output: the cosine similarity.

In [106]:
def cosine_similarity(d1:dict[str, float], d2:dict[str, float])->float:
    """Return the cosine similarity between two TF-IDF vectors.

    Args:
        d1: Term -> weight dictionary of the first document.
        d2: Term -> weight dictionary of the second document.

    Returns:
        ``dot_product(d1, d2) / (norm(d1) * norm(d2))``, or 0.0 when the
        product of the norms is zero.
    """
    denominator = norm(d1) * norm(d2)
    if denominator == 0:
        # A vector with no terms, or whose weights are all 0 (every term has
        # IDF 0), has zero norm; treat it as similar to nothing instead of
        # raising ZeroDivisionError.
        return 0.0
    return dot_product(d1, d2) / denominator

This function performs the sentiment analysis: it decides whether a given document is closer to the positive or to the negative reviews.
<br>To do so, it uses the cosine similarity: $\text{cos sim}(d_1,d_2) = \frac{V(d_1) \cdot V(d_2)}{|V(d_1)|\,|V(d_2)|}$


In [107]:
def sentiments_analysis(tf_idf_doc:dict[str, float], tf_idf:list[dict[str, float]], positive:set[int], negative:set[int], skip:int)->int:
    """Classify a document by its total similarity to each class.

    The document is compared, through ``cosine_similarity``, with every
    document whose index is in ``positive`` and with every document whose
    index is in ``negative``; the index ``skip`` is left out of both sums.
    As a side effect, ``skip`` is printed as a progress indicator (followed
    by a newline when it is a multiple of 50, by a space otherwise).

    Args:
        tf_idf_doc: TF-IDF vector of the document to classify.
        tf_idf: TF-IDF vectors of all documents, indexed by document number.
        positive: Indices in ``tf_idf`` of the positive documents.
        negative: Indices in ``tf_idf`` of the negative documents.
        skip: Index to exclude from both sums (and the value printed).

    Returns:
        1 if the summed similarity to the positive documents is strictly
        greater than the summed similarity to the negative ones, else 0.
    """
    print(skip, end=' ' if skip%50 else '\n')

    # NOTE: the scores are sums, not averages, so the class with more
    # documents contributes more terms to its score.
    positive_score = sum(cosine_similarity(tf_idf_doc, tf_idf[i]) for i in positive if i != skip)
    negative_score = sum(cosine_similarity(tf_idf_doc, tf_idf[i]) for i in negative if i != skip)

    return 1 if positive_score > negative_score else 0

This is the main function of the program: <ol>
<li>Creates the object enumerator for items</li>
<li>Reads the input file</li>
<li>Tokenize each document (reviews) read</li>
<li>Compute the frequency of each token inside each document</li>
<li>Computes the IDF (and DF) for each word in all documents</li>
<li>Computes the TF_IDF for each word of each document</li>
<li>Separates the documents into positive and negative ones</li>
<li>Computes the sentiment analysis on each document, comparing it with all the other documents</li>
<li>Prints the accuracy of the model</li>
</ol>

In [108]:
def main() -> None:
    """Run the whole pipeline on ``FILE_INPUT`` and print the accuracy.

    The reviews are loaded, tokenized and turned into TF-IDF vectors; each
    document is then classified with ``sentiments_analysis`` (its own index
    being the one skipped) and the percentage of predictions equal to the
    stored labels is printed with two decimals.
    """
    fields = enumReviews()
    reviews = load_file(FILE_INPUT)

    # Text -> tokens -> per-document counts -> TF-IDF vectors.
    term_counts = countFrequencies(tokenize(reviews))
    vectors = computeTF_IDF(term_counts, computeIDF(term_counts))

    positive, negative = separate_documents(reviews)

    predictions = [
        sentiments_analysis(vector, vectors, positive, negative, index)
        for index, vector in enumerate(vectors)
    ]

    hits = sum(
        1
        for guess, review in zip(predictions, reviews)
        if guess == review[fields.LABEL]
    )
    print(f'Accuracy: {hits / len(reviews) * 100:.2f}%')
    
    
# Run the pipeline defined in main() when this cell/script is executed.
main()

0
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150
151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180
181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210
211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240
241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 Accuracy: 74.90%
