In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



In [2]:
# Load the e-mail dataset (columns: text, spam) into a DataFrame.
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
df = pd.read_csv(filepath_or_buffer='E:/practice/emails.csv')

In [3]:
# Show the loaded frame; pandas abbreviates the printed view to the first and last rows.
print(df)

                                                   text  spam
0     Subject: naturally irresistible your corporate...     1
1     Subject: the stock trading gunslinger  fanny i...     1
2     Subject: unbelievable new homes made easy  im ...     1
3     Subject: 4 color printing special  request add...     1
4     Subject: do not have money , get software cds ...     1
...                                                 ...   ...
5723  Subject: re : research and development charges...     0
5724  Subject: re : receipts from visit  jim ,  than...     0
5725  Subject: re : enron case study update  wow ! a...     0
5726  Subject: re : interest  david ,  please , call...     0
5727  Subject: news : aurora 5 . 2 update  aurora ve...     0

[5728 rows x 2 columns]


## Replacing null values with empty strings

In [4]:
# Keep every non-null cell as it is and put an empty string wherever a value is missing.
mail_data = df.where(df.notna(), other='')

In [5]:
# Preview the first five rows of the null-filled frame.
print(mail_data.head())

                                                text  spam
0  Subject: naturally irresistible your corporate...     1
1  Subject: the stock trading gunslinger  fanny i...     1
2  Subject: unbelievable new homes made easy  im ...     1
3  Subject: 4 color printing special  request add...     1
4  Subject: do not have money , get software cds ...     1


## Encoding text values

In [6]:
# The `spam` column already holds numeric labels, so no encoding step is needed:
#   1 = spam
#   0 = ham


## Separating text and labels

In [7]:
# x holds the mail text (model input); y holds the 0/1 spam label (target).
x, y = mail_data['text'], mail_data['spam']

In [8]:
# Inspect the text Series that will be vectorised.
print(x)

0       Subject: naturally irresistible your corporate...
1       Subject: the stock trading gunslinger  fanny i...
2       Subject: unbelievable new homes made easy  im ...
3       Subject: 4 color printing special  request add...
4       Subject: do not have money , get software cds ...
                              ...                        
5723    Subject: re : research and development charges...
5724    Subject: re : receipts from visit  jim ,  than...
5725    Subject: re : enron case study update  wow ! a...
5726    Subject: re : interest  david ,  please , call...
5727    Subject: news : aurora 5 . 2 update  aurora ve...
Name: text, Length: 5728, dtype: object


In [9]:
# Inspect the label Series (1 = spam, 0 = ham).
print(y)

0       1
1       1
2       1
3       1
4       1
       ..
5723    0
5724    0
5725    0
5726    0
5727    0
Name: spam, Length: 5728, dtype: int64


## Splitting the data into training data and test data

In [10]:
# Hold out 20% of the mails for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=6,
)

In [11]:
# Report the sizes of the full set, the test split and the training split, in that order.
for series in (x, x_test, x_train):
    print(series.shape)

(5728,)
(1146,)
(4582,)


# Feature Extraction
### converting the text data into meaningful numerical values

In [15]:
# Transform the text data into TF-IDF feature vectors that can be used as
# input to the logistic regression model (text data ---> numerical values).
#
# min_df=1              keep any term that occurs in at least one document
# stop_words='english'  drop scikit-learn's built-in English stop words
# lowercase=True        lower-case the text before tokenising. This must be
#                       the boolean True: the string 'True ' is not a valid
#                       value and is rejected by scikit-learn's parameter
#                       validation in recent releases.
feature_extraction=TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

In [17]:
# Fit the vectoriser on the training mails only, then encode those same mails.
xtrainfeatures = feature_extraction.fit_transform(x_train)

# The test mails are encoded with the already-fitted vectoriser; it is not
# re-fitted, so the test set does not influence the learned vocabulary.
xtestfeatures = feature_extraction.transform(x_test)

# Make sure both label vectors are integer-typed before they reach the model.
# (The printed `spam` Series already reports an integer dtype, so this cast
# acts as a safeguard rather than a conversion.)
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [18]:
# Print the training feature matrix; each entry is shown as (row, column) followed by its TF-IDF weight.
print(xtrainfeatures)

  (0, 29777)	0.22265425280051765
  (0, 26037)	0.1486157919714039
  (0, 31364)	0.20292702850024014
  (0, 29294)	0.37121241734715393
  (0, 7510)	0.16906110429987345
  (0, 26904)	0.14918305788341485
  (0, 23967)	0.23988148631549075
  (0, 5073)	0.12973083509766364
  (0, 32354)	0.12172030973099203
  (0, 11850)	0.21291669869536686
  (0, 29268)	0.15263149509575735
  (0, 30006)	0.17943098309242866
  (0, 14167)	0.2639003052051949
  (0, 828)	0.09218049169968094
  (0, 17474)	0.17795339222238116
  (0, 26823)	0.40831727791272804
  (0, 29221)	0.4797629726309815
  (0, 29037)	0.04248758566514555
  (1, 10895)	0.14332209200727053
  (1, 8121)	0.15788089767682223
  (1, 32870)	0.15531650495844082
  (1, 18580)	0.07928491830158402
  (1, 10886)	0.12512539205757484
  (1, 3691)	0.2502828754322658
  (1, 21476)	0.13023645703865327
  :	:
  (4580, 10895)	0.06243857612641824
  (4580, 828)	0.032706677182529315
  (4580, 29037)	0.01507507416148644
  (4581, 20765)	0.35227425420608766
  (4581, 7602)	0.24604837286556586
 

# Logistic Regression

In [19]:
# Create the classifier; no arguments are passed, so scikit-learn's defaults apply.
model=LogisticRegression()

In [20]:
# training the logistic regression model with the training data
model.fit(X=xtrainfeatures, y=y_train)

LogisticRegression()

# Evaluating the trained model

In [22]:
# prediction on training data

# Score the model on the mails it was trained on (an optimistic estimate).
prediction = model.predict(xtrainfeatures)
accuracy_on_trainingdata = accuracy_score(y_true=y_train, y_pred=prediction)
print("Accuracy on training data =", accuracy_on_trainingdata)

Accuracy on training data = 0.9965080750763858


In [27]:
# prediction on test data

# Score the model on the held-out mails it has not seen during training.
prediction = model.predict(xtestfeatures)
accuracy_on_testdata = accuracy_score(y_true=y_test, y_pred=prediction)
print("Accuracy on testing data =", accuracy_on_testdata)

Accuracy on testing data = 0.9764397905759162


# Building a predictive system

In [26]:
input=["Subject: here ' s a hot play in motion  homeland security investments  the terror attacks on the united states on september 11 , 20 ol have  changed  the security landscape for the foreseeable future . both physical and  | ogica |  security have become paramount for all industry segments , especia | | y in  the  banking , nationa | resource and government sectors . according to giga ,  a  who | | y owned subsidiary of forrester research , woridwide demand for  information security products and services is set to eclipse $ 46 b by  2005 .  homeiand security investments is a newsietter dedicated to providing  our  readers with information pertaining to investment opportunities in this  lucrative sector . as we know , events related to homeland security  happen  with lightning speed . what we as investors can do is position  ourselves in  such a way as to take advantage of the current trends and be ready to  capitalize on events which have yet to happen . homeland security  investments is here to heip our readers do just that .  with this in mind , it is with great excitement that we present vinoble ,  inc .  this stock is expected to do big things in both the near and | ong  terms .  symbol : vnbl . ob  current price : o . 08  short term target price : o . 35  12 month target price : 1 . 20  * * * why we believe vnbl . ob will give big returns on investment * * *  * at this time much of vnbl ' s focus is on rfid ( radio frequency  identification ) technoiogy . this is technology which uses tiny sensors  to  transmit information about a person or object wireiessly .  * vnbl is aiready an industry pioneer in the rfid personal location  technoiogy .  * vnbl is developing a form of rfid technology which allows companies  and  governments to wirelessly track their assets and resources . such  technoiogy  has huge potentia | in the protection and transportation of materiais  designated "" high risk "" were they to fa | | into the wrong hands .  
* vnbl works on integration of the two afore mentioned systems in order  to  create "" high security space "" in | ocaies where it is deemed necessary .  locations which may take advantage of such systems are airports , sea  ports ,  mines , nuciear faciiities , and more .  * as with a | | stocks , news drives the short term price . fresh news has  made vnbl a hot buy .  news on vnbl  malibu , calif . - - ( business wire ) - - june 16 , 2 oo 5 - - vinoble , inc .  ( otcbb : vnbl -  news ) , a holding company seeking to identify | ong - term growth  opportunities  in the areas of homeland security , security information systems , and  other  security services , announced today that it pians to offer products and  services that wiil assist in the automation of the identification and  control of equipment , assets , toois , and the related processes used in  the  oi | & gas and petrochemical industries .  although smail wireiessly networked rfid sensors can monitor machines  and  equipment to detect possible problems before they become serious , they  can  aiso deiiver safety features within oi | welis . oi | maybe trapped in  different | ayers of rock , aiong with gas and water . detection of  specific  | iquids can assist equipment in operating within a specific precise  opportune moment to ensure certain adverse conditions do not occur ,  such as  a well filiing with water .  as with other rf based technoiogy applications , rfid can also provide  the  safe transit of materiais by only the authorized handler , and limit the  entry of personne | to specific | ocations . ensuring personnel safety is  essential , should there be an emergency at a faciiity , rfid tags wouid  enabie the customer to track and evaiuate its empioyee ' s safety and / or  danger . 
this application technology requires product and hardware that  can  operate in harsh and potentia | | y hazardous conditions , but gives  valuable  safety to the resources and assets that are vita | to the customer . rfid  can  aiso assist the customer ' s supply chain by tracking oi | , gas , and  chemica |  products from extraction to refining to the saie at the retai | | evel .  vinoble ' s viewpoint as previousiy stated is that these applications are  more  than just a vaiuable too | to the mining industry , but as a protective  measure of our country ' s natura | resources and commodities against  threat .  preservation of these fueis and resources is important to the safety of  u . s .  industry and economy .  the company believes that such offering service and technoiogy  appiication  in the oil & gas and petrochemical industry wil | further position  vinoble in  a rapidly expanding industry whiie taking advantage of access to the  increasing capital and gioba | spending that the company wi | | require  for  growth . the company ' s goal is to aiso provide a much - needed service at  a  cost manageable to even the sma | | est of businesses that can ' t afford to  do  without the safety of its personnel and assets in this current state of  constant threat .  this is outstanding news . the growth potential for this company is  exceptional . in an already hot industry , vnbl . ob stands out as a truiy  innovative pioneer . we see big things happening to this stock .  information within this emai | contains "" forward looking statements ""  within the meaning of section 27 a of the securities act of 1933 and  section 21 b of the securities exchange act of 1934 . any statements that  express or involve discussions with respect to predictions ,  expectations , beliefs , pians , projections , objectives , goals ,  assumptions or  future  events or performance are not statements of historica | fact and may be  "" forward | ooking statements . 
"" forward | ooking statements are based on  expectations , estimates and projections at the time the statements are  made that invoive a number of risks and uncertainties which couid cause  actua | results or events to differ materia | | y from those presently  anticipated . forward looking statements in this action may be  identified  through the use of words such as "" projects "" , "" foresee "" , "" expects "" ,  "" wi | | , "" "" anticipates , "" "" estimates , "" "" beiieves , "" "" understands "" or  that by  statements indicating certain actions "" may , "" "" couid , "" or "" might "" occur .  as with many micro - cap stocks , today ' s company has additional risk  factors worth noting . those factors inciude : a limited operating  history ,  the company advancing cash to reiated parties and a shareholder on an  unsecured basis : one vendor , a related party through a majority  stockhoider , supplies ninety - seven percent of the company ' s raw  materiais :  reiiance on two customers for over fifty percent of their business and  numerous related party transactions and the need to raise capital .  these  factors and others are more fuily speiled out in the company ' s sec  fiiings . we urge you to read the filings before you invest . the rocket  stock  report does not represent that the information contained in this  message states ail materia | facts or does not omit a material fact  necessary  to make the statements therein not misleading . ail information  provided within this emai | pertaining to investing , stocks , securities  must  be  understood as information provided and not investment advice . the  rocket stock report advises all readers and subscribers to seek advice  from  a registered professiona | securities representative before deciding to  trade in stocks featured within this email . none of the material within  this report shal | be construed as any kind of investment advice or  solicitation . 
many of these companies are on the verge of bankruptcy .  you  can lose ail your money by investing in this stock . the publisher of  the rocket stock report is not a registered investment advisor .  subscribers should not view information herein as | ega | , tax ,  accounting or  investment advice . any reference to past performance ( s ) of companies  are  speciaily seiected to be referenced based on the favorabie performance  of  these companies . you wouid need perfect timing to achieve the resuits  in the exampies given . there can be no assurance of that happening .  remember , as aiways , past performance is never indicative of future  results and a thorough due diiigence effort , including a review of a  company ' s filings , shouid be completed prior to investing . in  compiiance  with the securities act of 1933 , section 17 ( b ) , the rocket stock report  discioses the receipt of tweive thousand doilars from a third party  ( gem , inc . ) , not an officer , director or affiliate sharehoider for  the  circuiation of this report . gem , inc . has a position in the stock  they  wil | se | | at any time without notice . be aware of an inherent confiict  of interest resuiting from such compensation due to the fact that this  is a paid advertisement and we are conflicted . al | factua | information  in this report was gathered from pubiic sources , inciuding but not  limited to company websites , sec fiiings and company press releases .  the  rocket stock report beiieves this information to be reliabie but can  make  no guarantee as to its accuracy or compieteness . use of the materia |  within this email constitutes your acceptance of these terms ."]

# feature extraction of input data
# NOTE(review): the list assigned to `input` earlier in this cell shadows
# Python's built-in input(); the name is reused here only because that
# assignment defines it. Consider renaming it there and here together.
fe_sample=feature_extraction.transform(input)

# making predictions
prediction=model.predict(fe_sample)

# predict() returns one label per mail in `input`. Report each label in turn
# instead of truth-testing the whole array, which raises ValueError as soon
# as more than one mail is supplied.
for label in prediction:
    if label==1:
        print("Spam mail")
    else:
        print("Ham mail")

Spam mail
