In [2]:
import os
import nltk
import os
import codecs
import re
from nltk import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import numpy
from sklearn.feature_extraction.text import CountVectorizer
NEWLINE = '\n'
SKIP_FILES = {'cmds'}

#Text Processing
stemmer = PorterStemmer()

#Text Processing
def preprocess(raw) :
    wordlist = word_tokenize(raw)
    returnlist = []
    for word in wordlist :
        if (word not in stopwords.words("english")) :
            if (word[0] in (('1','2','3','4','5','6','7','8','9','0','&','(','-','_','ç','é','à',')','=','+','°','~','#'))) :
                returnlist.append('#')
            else :
                returnlist.append(stemmer.stem(word))
    return returnlist

def read_files(path):
    for root, dir_names, file_names in os.walk(path):
        for path in dir_names:
            read_files(os.path.join(root, path))
        for file_name in file_names:
            if file_name not in SKIP_FILES:
                file_path = os.path.join(root, file_name)
                if os.path.isfile(file_path):
                    lines = []
                    f = open(file_path, encoding="latin-1")
                    for line in f:
                        lines.append(line)

                    f.close()
                    content = NEWLINE.join(lines)
                    yield file_path, content

In [3]:
from pandas import DataFrame


def build_data_frame(path, classification):
    rows = []
    index = []
    for file_name, text in read_files(path):
        rows.append({'text': text, 'class': classification})
        index.append(file_name)

    data_frame = DataFrame(rows, index=index)
    return data_frame

In [4]:
HAM = 'ham'
SPAM = 'spam'
""",
    ('/home/techmen/Desktop/IA/dataset/enron2/spam',    SPAM),
    ('/home/techmen/Desktop/IA/dataset/enron3/spam',    SPAM),
    ('/home/techmen/Desktop/IA/dataset/enron4/spam',      SPAM),
    ('/home/techmen/Desktop/IA/dataset/enron5/spam',    SPAM),
    ('/home/techmen/Desktop/IA/dataset/enron6/spam',  SPAM),
    ('/home/techmen/Desktop/IA/dataset/enron1/ham',        HAM),
    ('/home/techmen/Desktop/IA/dataset/enron2/ham',    HAM),
    ('/home/techmen/Desktop/IA/dataset/enron3/ham',    HAM),
    ('/home/techmen/Desktop/IA/dataset/enron4/ham',      HAM),
    ('/home/techmen/Desktop/IA/dataset/enron5/ham',    HAM),
    ('/home/techmen/Desktop/IA/dataset/enron6/ham',  HAM),"""
SOURCES = [
    ('/home/toutou/Téléchargements/data/enron1/spam',        SPAM),
    ('/home/toutou/Téléchargements/data/enron1/ham',        HAM)
]

data = DataFrame({'text': [], 'class': []})
for path, classification in SOURCES:
    data = data.append(build_data_frame(path, classification))

data = data.reindex(numpy.random.permutation(data.index))

data

Unnamed: 0,class,text
/home/toutou/Téléchargements/data/enron1/ham/0638.2000-03-20.farmer.ham.txt,ham,Subject: @ ect . enron . com email notificatio...
/home/toutou/Téléchargements/data/enron1/ham/2506.2000-10-10.farmer.ham.txt,ham,Subject: union carbide - seadrift\n\ndaren\n\n...
/home/toutou/Téléchargements/data/enron1/ham/0479.2000-02-24.farmer.ham.txt,ham,Subject: midcon invoices\n\nkellie -\n\ni rese...
/home/toutou/Téléchargements/data/enron1/spam/2285.2004-09-26.GP.spam.txt,spam,Subject: fw : hungry 30 to 40 girls wants to d...
/home/toutou/Téléchargements/data/enron1/ham/5063.2001-11-12.farmer.ham.txt,ham,Subject: pipelines that still have dial in acc...
/home/toutou/Téléchargements/data/enron1/ham/2925.2000-11-22.farmer.ham.txt,ham,Subject: re : cornhusker\n\nthanks for the inf...
/home/toutou/Téléchargements/data/enron1/ham/0907.2000-04-11.farmer.ham.txt,ham,Subject: re : fyi - wellhead portfolio\n\nwho ...
/home/toutou/Téléchargements/data/enron1/spam/4922.2005-07-25.GP.spam.txt,spam,Subject: u . s . robotics - analogue / wired c...
/home/toutou/Téléchargements/data/enron1/spam/4478.2005-05-12.GP.spam.txt,spam,Subject: fda approved\n\nwe are one of the top...
/home/toutou/Téléchargements/data/enron1/ham/0188.2000-01-12.farmer.ham.txt,ham,Subject: re : hl & p for 12 / 99\n\nit is the ...


In [5]:
count_vectorizer = CountVectorizer()
counts = count_vectorizer.fit_transform(data['text'].values)


In [6]:
#fILTRE BAISIEN MultinomialNB
from sklearn.naive_bayes import MultinomialNB

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [7]:
examples = ["Subject: christmas tree farm pictures\n"]
example_counts = count_vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['spam'], 
      dtype='<U4')

In [8]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('vectorizer',  CountVectorizer()),
    ('classifier',  MultinomialNB()) ])

pipeline.fit(data['text'].values, data['class'].values)
p=pipeline.predict(examples)

In [9]:
p

array(['spam'], 
      dtype='<U4')

In [10]:
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix, f1_score

k_fold = KFold(n=len(data), n_folds=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)

print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)



Total emails classified: 5172
Score: 0.96115983853
Confusion matrix:
[[3628   44]
 [  72 1428]]


In [11]:
#fILTRE BAISIEN BernoulliNB
from sklearn.naive_bayes import BernoulliNB

classifierb = BernoulliNB()
targets = data['class'].values
classifierb.fit(counts, targets)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [12]:
predictions = classifierb.predict(example_counts)
predictions

array(['ham'], 
      dtype='<U4')

In [14]:
pipelineb = Pipeline([
    ('vectorizer',  CountVectorizer()),
    ('classifier',  BernoulliNB()) ])

pipelineb.fit(data['text'].values, data['class'].values)
pb=pipelineb.predict(examples)

In [15]:
pb

array(['ham'], 
      dtype='<U4')

In [16]:
k_fold = KFold(n=len(data), n_folds=4)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values
    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)

print('Total emails classified:', len(data))
print('Score:', sum(scores)/len(scores))
print('Confusion matrix:')
print(confusion)

Total emails classified: 5172
Score: 0.959691336923
Confusion matrix:
[[3627   45]
 [  75 1425]]


In [17]:
from sklearn import svm
from sklearn.svm import SVC

In [18]:
classifiersvm = SVC(kernel='linear', C=100)
targets = data['class'].values
classifiersvm.fit(counts, targets)


SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [19]:
predictions = classifiersvm.predict(example_counts)
predictions

array(['ham'], dtype=object)

In [21]:
pipelinesvm = Pipeline([
    ('vectorizer',  CountVectorizer()),
    ('classifier',  SVC()) ])

pipelinesvm.fit(data['text'].values, data['class'].values)
psvm=pipelinesvm.predict(examples)

In [22]:
psvm

array(['ham'], dtype=object)

In [23]:
k_fold = KFold(n=len(data), n_folds=6)
scores = []
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values
    pipelinesvm.fit(train_text, train_y)
    predictions = pipelinesvm.predict(test_text)

    confusion += confusion_matrix(test_y, predictions)
    score = f1_score(test_y, predictions, pos_label=SPAM)
    scores.append(score)

print('total  des emails classifiés:', len(data))
print('Score:', sum(scores)/len(scores))
print('Matrice de Confusion :')
print(confusion)

"""Total des  emails classifiés: 5172
Score: 0.145314516992
matrice de Confusion :
[[3665    7]
 [1381  119]] avec C =10 aussi pour C=1"""

"""total  des emails classifiés: 5172
Score: 0.14916197263
Matrice de Confusion :
[[3662   10]
 [1378  122]] pour C=100 ce qui montre que notre scores'ameliore avec l'augmentation de C bien comme la théorie l'annonce """

total  des emails classifiés: 5172
Score: 0.14916197263
Matrice de Confusion :
[[3662   10]
 [1378  122]]


"total  des emails classifiés: 5172\nScore: 0.151798421428\nMatrice de Confusion :\n[[3665    7]\n [1376  124]] pour C=100 ce qui montre que notre scores'ameliore avec l'augmentation de C bien comme la théorie l'annonce "

In [82]:
confusion[0][0]

3665

In [30]:
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer



In [35]:
targets = data['class'].values
vectorizer = TfidfVectorizer(stop_words='english')

X = vectorizer.fit_transform(data['text'].values)

true_k = 2
classifiekm = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
classifiekm.fit(X,targets)
cl0=[]
cl1=[]
print("les mots qui apparaissent le plus sur :")
Top_Mots = classifiekm.cluster_centers_.argsort()[:, ::-1]
Termes = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in Top_Mots[i, :30]:
        print(' %s' % Termes[ind])
        if i==0:
            cl0.append(Termes[ind])
        else :
            cl1.append(Termes[ind])
        


print("\n\n")
print(cl0)
print("\n\n")
print(cl1)
colors = ['b', 'g']
markers = ['0', '', 's']



#classifierkmeans.fit(counts, targets)


les mots qui apparaissent le plus sur :
Cluster 0:
 subject
 deal
 gas
 com
 meter
 00
 000
 thanks
 http
 know
 2000
 mmbtu
 nomination
 need
 2001
 new
 daren
 let
 enron
 day
 01
 attached
 10
 hpl
 th
 time
 www
 price
 want
 message
Cluster 1:
 ect
 enron
 hou
 hpl
 2000
 xls
 000
 subject
 nom
 file
 teco
 2001
 attached
 tap
 hplno
 cc
 pm
 actuals
 hplo
 corp
 gas
 10
 01
 meter
 deal
 03
 forwarded
 02
 daren
 30



['subject', 'deal', 'gas', 'com', 'meter', '00', '000', 'thanks', 'http', 'know', '2000', 'mmbtu', 'nomination', 'need', '2001', 'new', 'daren', 'let', 'enron', 'day', '01', 'attached', '10', 'hpl', 'th', 'time', 'www', 'price', 'want', 'message']



['ect', 'enron', 'hou', 'hpl', '2000', 'xls', '000', 'subject', 'nom', 'file', 'teco', '2001', 'attached', 'tap', 'hplno', 'cc', 'pm', 'actuals', 'hplo', 'corp', 'gas', '10', '01', 'meter', 'deal', '03', 'forwarded', '02', 'daren', '30']


In [26]:
print("Prediction d'un example:")

Y = vectorizer.transform(["Subject: christmas tree farm pictures\n"])
predictions = classifiekm.predict(Y)
print("Cluster n°:")
print(predictions)

Prediction d'un example:
Cluster n°:
[0]


In [38]:
pipelinekm = Pipeline([
    ('vectorizer',  CountVectorizer()),
    ('classifier',  KMeans()) ])

pipelinekm.fit(data['text'].values)
pkm=pipelinekm.predict(examples)

pkm

array([0], dtype=int32)

In [47]:
data['class'].value_counts()
#data['text'].value_counts()

ham     3672
spam    1500
Name: class, dtype: int64

In [53]:
data['text'].values
worddata =[word_tokenize(c) for c in data['text'].values ]

In [51]:
worddata


[['Subject',
  ':',
  '@',
  'ect',
  '.',
  'enron',
  '.',
  'com',
  'email',
  'notification',
  '!',
  'we',
  'are',
  'one',
  '@',
  'enron',
  '.',
  'com',
  '!',
  'please',
  'be',
  'aware',
  'of',
  'the',
  'following',
  'senders',
  'were',
  'automatically',
  'notified',
  'to',
  '(',
  'a',
  ')',
  '.',
  'stop',
  'sending',
  'internet',
  'mail',
  'to',
  'your',
  '@',
  'ect',
  '.',
  'enron',
  '.',
  'com',
  'address',
  'and',
  'to',
  '(',
  'b',
  ')',
  '.',
  'send',
  'future',
  'internet',
  'communications',
  'to',
  'daren',
  '.',
  'j',
  '.',
  'farmer',
  '@',
  'enron',
  '.',
  'com',
  ':',
  'fpam',
  '_',
  '@',
  'hotmail',
  '.',
  'com',
  ',',
  'mjones',
  '7',
  '@',
  'txu',
  '.',
  'com',
  'reminder',
  ':',
  'your',
  '@',
  'ect',
  '.',
  'enron',
  '.',
  'com',
  'address',
  'should',
  'not',
  'be',
  'used',
  'any',
  'longer',
  'and',
  'will',
  'be',
  'deactivated',
  'soon',
  '.',
  'so',
  'please',
  'm

In [58]:
for cle in data:
    print(cle)

class
text


In [72]:
for elm in data:
    for elm2 in data['class']
        if elm== 'spam':
            print(elm)

class
text


In [76]:
data['text'][0]

'Subject: @ ect . enron . com email notification !\n\nwe are one @ enron . com !\n\nplease be aware of the following senders were automatically notified to ( a ) .\n\nstop sending internet mail to your @ ect . enron . com address and to ( b ) . send\n\nfuture internet communications to daren . j . farmer @ enron . com :\n\nfpam _ @ hotmail . com , mjones 7 @ txu . com\n\nreminder :\n\nyour @ ect . enron . com address should not be used any longer and will be\n\ndeactivated soon . so please make sure these contacts switch to your new\n\n@ enron . com address . if you have subscribed to mailing lists , please make\n\nsure to update your addresses there as well .\n\nand\n\nyour shortname @ enron . com address ( i . e . jsmith @ enron . com ) will continue to\n\nwork , even though your formal address is longname @ enron . com ( i . e .\n\njohn . smith @ enron . com )\n\nplease do not reply to this message as it was automatically generated .'

In [105]:


sp=[]
hm=[]
liste=[]
for i in range(len(data['class'])):
    if data['class'][i]=='spam':
        liste=preprocess(data['text'][i])
        sp.extend(liste)
    else:
        liste=preprocess(data['text'][i])
        hm.extend(liste)
     

In [117]:
#garder queles elments distinctes dans les deux listes
hm = list(set(hm))
sp = list(set(sp))

#matrice contenant la frequence d'apparution de chaque mot 
hm_t=[[x,0] for x in hm]
sp_t=[[x,0] for x in sp]

sp = list(set(sp))
for j in range(1000): #len(hm)
    for i in range(len(data['class'])):  #len(data['class'])-5000
        if data['class'][i]=='ham':
            hm_t[j][1]+=data['text'][i].count(hm[j])
            
            
for j in range(1000): #len(sp)
    for i in range(len(data['class'])):  #len(data['class'])-5000
        if data['class'][i]=='spam':
            sp_t[j][1]+=data['text'][i].count(sp[j])
            
       

In [118]:
hm_t 

[['step', 291],
 ['prepar', 115],
 ['kettler', 1],
 ['tri', 851],
 ['kaufman', 3],
 ['rglover', 2],
 ['?', 2774],
 ['assur', 14],
 ['pasadena', 10],
 ['buyer', 55],
 ['gg', 349],
 ['marriag', 2],
 ['manual', 41],
 ['glenda', 2],
 ['child', 51],
 ['loraleigh', 1],
 ['backgound', 1],
 ['sql', 4],
 ['kleberg', 14],
 ['greed', 49],
 ['solver', 1],
 ['layni', 4],
 ['carrington', 4],
 ['idaho', 1],
 ['exact', 29],
 ['dbaumba', 2],
 ['ne', 14361],
 ['domino', 2],
 ['circumst', 7],
 ['revoir', 1],
 ['citywid', 1],
 ['upto', 1],
 ['tino', 2],
 ['cat', 1603],
 ['fitzgerald', 3],
 ['aspx', 1],
 ['benoitjasonp', 1],
 ['egarden', 1],
 ['persist', 5],
 ['larger', 15],
 ['stuck', 2],
 ['salt', 8],
 ['evp', 1],
 ['clarifi', 27],
 ['hbd', 3],
 ['amerac', 1],
 ['old', 467],
 ['kleb', 16],
 ['bonsai', 1],
 ['baughman', 4],
 ['gasolin', 4],
 ['similarli', 0],
 ['ind', 1221],
 ['whitton', 1],
 ['crcandmac', 1],
 ['spectrum', 3],
 ['swxl', 2],
 ['prod', 1214],
 ['wu', 10],
 ['stenophon', 1],
 ['ident', 231]

In [116]:
len(hm)


12395

In [115]:
len(sp)

31605