In [15]:
import pandas as pd 
from GmailAnalyser.GmailClient import GMailClient
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline

df = pd.read_csv('/Users/remy.deme/PycharmProjects/readMail/GmailAnalyser/people_forums_mail.csv', delimiter=',')

In [None]:
import spacy
import en_core_web_sm

print(f'Spacy version {spacy.__version__}')
nlp = en_core_web_sm.load()
stop_words = spacy.lang.en.STOP_WORDS
punctuations = spacy.lang.punctuation.LIST_PUNCT


In [None]:
noisy_words = ['pt', 'gq', 'ml', 'kn', 'si','pl','cx', 'sl', 'lj', 'mr', 'okz', 'nrhj', 'giz', 'pl', 'ax', 'qs', 'ww', 'mt', 'sy', 'sp', 'hz', 'ck', 'ck', 'uw', 'cx', 'jb', 'tt', 'pl', 'uw', 'cx', 'nhl', 'sl', 'pp']
def prepareText(text, punctuation=True, lemming=True, stop_word=True):
    """
    Prepare the text by removing punctuation, stop words and doing lemming 
    :param text: 
    :return: text 
    """     
    clean_text = nlp(text)
    
    #lowering word
    #lemming 
    # if words is pronoun don't apply lemming because spacy convert the words 
    # in "_PRON-" 
    if lemming == True:
        clean_text = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in clean_text]
    
    #remove stop words
    if stop_word == True:
        clean_text = [ word for word in clean_text if (word not in stop_words) ]
    # remove punctuation 
    if punctuation == True:
        clean_text = [word for word in clean_text if word.isalpha() ]
    
    #remove single char [b-Z] we only keep 'a'
    clean_text = [ word for word in clean_text if (len(word) != 1 and word != 'a') ]

    clean_text = [ word for word in clean_text if (word not in noisy_words)]


    return clean_text

In [14]:
vectorizer = CountVectorizer(tokenizer=prepareText)
lda = LatentDirichletAllocation(n_components=20)
pipeline = Pipeline([('vectorizer',vectorizer), ('lda',lda)])

0    b"Le gain d=E2=80=99acquisition d=E2=80=99ESPP...
1    b'Pff, je ne comprends rien =C3=A0 ces d=C3=A9...
2    b'SuKAmWFpIGFwcHLDqWNpw6kgbGUgY2FkZWF1IMOpZ2Fs...
3    b"=F0=9F=99=8F=F0=9F=99=8C\r\n\r\nOn Thu, 11 J...
4    b"Super. Merci\r\n\r\nLe jeu. 11 juin 2020 =C3...
5    b"Hello, je trouve que vous  ne mettez pas du ...
Name: body, dtype: object

In [3]:
df.body.fillna(' ', inplace=True)

In [8]:
def transformBody(message):
    body = ""
    splited = message.split('\n')
    for line in splited:
        print(line)
        if line[0] != '>':
            body  += line + "\n"
    return body
            
            

In [17]:
df.info()            
        

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               200 non-null    int64 
 1   body                     198 non-null    object
 2   X-Spam-Checked-In-Group  197 non-null    object
 3   X-Original-Sender        198 non-null    object
 4   To                       196 non-null    object
 5   Date                     198 non-null    object
 6   From                     198 non-null    object
 7   In-Reply-To              178 non-null    object
 8   Delivered-To             198 non-null    object
 9   Cc                       174 non-null    object
dtypes: int64(1), object(9)
memory usage: 15.8+ KB


In [9]:
result = df.body.apply(transformBody)

Pff, je ne comprends rien =C3=A0 ces d=C3=A9clarations et il ne me reste qu=
e 10 '

De ce que je crois comprendre du guide en page 15, si j'ai souscrit au plan
ESPP, et que je n'ai rien fait / vendu, c'est g=C3=A9r=C3=A9 automatiquemen=
t, les
"revenus" en nature sont pris en compte dans ma d=C3=A9claration -> donc je=
 n'ai
rien =C3=A0 d=C3=A9clarer en plus
c'est cela ?
il me reste 5', argh, mais apr=C3=A8s j'aurais surement le droit =C3=A0 l'e=
rreur
<https://www.impots.gouv.fr/portail/particulier/je-corrige-mes-erreurs-0>;-=
)

Pr=C3=A9visualiser la pi=C3=A8ce jointe Accenture - IR 2019 - Guide pratiqu=
e.pdf
Accenture - IR 2019 - Guide pratique.pdf
3.6 MB
<https://mail.google.com/mail/u/0?ui=3D2&ik=3D26d2ee32eb&attid=3D0.1&permms=
gid=3Dmsg-f:1668561881306363310&th=3D1727ec30cd05f5ae&view=3Datt&disp=3Dsaf=
e&realattid=3Df_kazh2gar0>







----

*Dominique* *Lequepeys | *Coach Lean Startup

+33 (0)6 28 82 36 49

..................................

IndexError: string index out of range

In [36]:
df.head()

Unnamed: 0.1,Unnamed: 0,body,X-Spam-Checked-In-Group,X-Original-Sender,To,Date,From,In-Reply-To,Delivered-To,Cc
0,0,\n\n\n,people@octo.com,dlequepeys@octo.com,people <people@octo.com>,"Thu, 11 Jun 2020 23:56:32 +0200",Dominique Lequepeys <dlequepeys@octo.com>,<CAL2k6PHpZNL-Vhkd48ej08LB4MWk0BTpYjSvrwkxPb7-...,remy.deme@octo.com,
1,1,\n\n\n,people@octo.com,nsafta@octo.com,"Pierre Nicoli <pnicoli@octo.com>, Fabrice Gout...","Thu, 11 Jun 2020 20:34:46 +0000",Nidhal Mohamed Safta <nsafta@octo.com>,<CAC+_vnrj6O=vSn6CY5-V2vpb-=NXazbCaGpz2CPd+wT2...,remy.deme@octo.com,
2,2,\n\n\n,people@octo.com,ablangero@octo.com,Matthieu Vetter <mvetter@octo.com>,"Thu, 11 Jun 2020 21:06:09 +0200",Annabelle Blangero <ablangero@octo.com>,<CA+oaERYxZrsU2nksvB6BRPdxgsXaaoTcT7h4OP3PLo-S...,remy.deme@octo.com,"BCorp <bcorp@octo.com>, ""corp.france"" <corp.fr..."
3,3,\n\n\n,people@octo.com,mvetter@octo.com,people@octo.com,"Thu, 11 Jun 2020 20:35:21 +0200",Matthieu Vetter <mvetter@octo.com>,<CAJWkGDccoSPAnHRi7nA3J-FX8ZFGzJPtwqBrw3Qo8xGx...,remy.deme@octo.com,"BCorp <bcorp@octo.com>, ""corp.france"" <corp.fr..."
4,4,\n\n\n,people@octo.com,flenci@octo.com,Alexandre Siguier <asiguier@octo.com>,"Thu, 11 Jun 2020 20:31:27 +0200",Frederic Lenci <flenci@octo.com>,<CAPCO-zmtetXCHyKtGu++B0A0Xq1M85g32-s+gM8nH8bT...,remy.deme@octo.com,"Florent Jaby <fjaby@octo.com>, Christophe Thib..."


### Analyse the 

In [133]:
threadId = df.threadId.unique()

In [141]:
threadId = df.threadId.unique()
for id in threadId:
    count = df[df['threadId'] == id ].size
    df.loc[df['threadId'] == id,'importance_thread'] = count
    
    


In [143]:
df.head(100)



Unnamed: 0.1,Unnamed: 0,id,threadId,labelIds,snippet,payload,sizeEstimate,historyId,internalDate,importance_thread
0,0,17275bc3cce6497f,172659e4e9356de7,"['UNREAD', 'IMPORTANT', 'CATEGORY_FORUMS', 'IN...",Félicitations Clara :) Quel petit chou et quel...,"{'partId': '', 'mimeType': 'multipart/related'...",8309988,215485,1591113147000,171.0
1,1,17275b282bd7e104,172659e4e9356de7,"['UNREAD', 'IMPORTANT', 'CATEGORY_FORUMS', 'IN...","Félicitation, Bienvenue à Gaspard, il est tout...","{'partId': '', 'mimeType': 'multipart/alternat...",62110,215403,1591112492000,171.0
2,2,17275469b37bf18f,172750d9af65a610,"['UNREAD', 'CATEGORY_FORUMS', 'INBOX']","Bonjour, @Marina Guerin : je pense que nous po...","{'partId': '', 'mimeType': 'multipart/related'...",72555,215282,1591105443000,100.0
3,3,17275421059326b5,172750d9af65a610,"['UNREAD', 'CATEGORY_FORUMS', 'INBOX']",d&#39;ailleurs tout ça me fait penser à une vi...,"{'partId': '', 'mimeType': 'multipart/related'...",69817,215282,1591105131000,100.0
4,4,172753fcd88090be,172750d9af65a610,"['UNREAD', 'CATEGORY_FORUMS', 'INBOX']",Merci pour les photos Cyril ! Cette année ça r...,"{'partId': '', 'mimeType': 'multipart/related'...",69589,215282,1591104999000,100.0
...,...,...,...,...,...,...,...,...,...,...
95,95,1725bec1c28531e7,1725bec1c28531e7,"['UNREAD', 'CATEGORY_FORUMS', 'INBOX']","Hello les octos, A l&#39;occasion des 2 ans du...","{'partId': '', 'mimeType': 'multipart/alternat...",17972,204210,1590680085000,10.0
96,96,1725bb666c1502a2,172511f6e19ce721,"['UNREAD', 'IMPORTANT', 'CATEGORY_FORUMS', 'IN...","Bisous bibiiii 😘 On Tue, May 26, 2020 at 3:15 ...","{'partId': '', 'mimeType': 'multipart/alternat...",26158,204110,1590676566000,330.0
97,97,1725bb33cf323512,172511f6e19ce721,"['UNREAD', 'IMPORTANT', 'CATEGORY_FORUMS', 'IN...","Marie, ça a été un réel plaisir de travailler ...","{'partId': '', 'mimeType': 'multipart/related'...",158590,204018,1590676331000,330.0
98,98,1725b8eaa895820e,172511f6e19ce721,"['UNREAD', 'IMPORTANT', 'CATEGORY_FORUMS', 'IN...",Merci pour tout Marie. Je te souhaite le meill...,"{'partId': '', 'mimeType': 'multipart/alternat...",64791,203774,1590673961000,330.0


In [162]:
client = GMailClient(path_to_credentials='/Users/remy.deme/PycharmProjects/readMail/credentials.json')
client.init()# initialise the client

Failed to create service an error occured 


In [None]:
message = client.GetMessage(userID='me',msgID='172a269baef28ffb')


