Permalink
Browse files

Added mail-parser library.

New folder names.
Fixed tests
  • Loading branch information...
1 parent b30a7a2 commit 7981bc73c4579c079d550dd9efd5f333593cf6af @fedelemantuano fedelemantuano committed Sep 11, 2016
View
@@ -6,6 +6,4 @@
_build
_resources
logs
-unittest/mails/
-unittest/samples/
virtualenvs
View
@@ -1,4 +1,4 @@
-<p align="center"><img src="doc/logo/spamscope.jpg"/></p>
+<p align="center"><img src="docs/logo/spamscope.jpg"/></p>
## Overview
View
@@ -1,5 +0,0 @@
-from __future__ import absolute_import
-
-VERSION = (0, 7, 0)
-__version__ = VERSION
-__versionstr__ = 'v' + '.'.join(map(str, VERSION))
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes.
File renamed without changes
View
@@ -6,6 +6,7 @@ Fabric==1.11.1
invoke==0.12.2
Jinja2==2.8
lxml==3.6.0
+mail-parser==0.1.0
MarkupSafe==0.23
paramiko==1.17.0
patool==1.12
@@ -21,6 +22,6 @@ simplejson==3.8.2
six==1.10.0
ssdeep==3.1.1
streamparse==2.1.4
-tika-app==0.3
+tika-app==0.4.0
urllib3==1.16
virustotal-api==1.1.2
@@ -1,288 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-"""
-Copyright 2016 Fedele Mantuano (https://twitter.com/fedelemantuano)
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
-from __future__ import unicode_literals
-from email.errors import HeaderParseError
-from email.header import decode_header
-import datetime
-import email
-import logging
-import time
-
-try:
- import simplejson as json
-except ImportError:
- import json
-
-log = logging.getLogger(__name__)
-
-
-class InvalidMail(ValueError):
- pass
-
-
-class NotUnicodeError(ValueError):
- pass
-
-
-class MailParser(object):
-
- """Class to parse mail. """
-
- def parse_from_file(self, fd):
- with open(fd) as mail:
- self._message = email.message_from_file(mail)
- self._parse()
-
- def parse_from_string(self, s):
- self._message = email.message_from_string(s)
- self._parse()
-
- def _decode_header_part(self, header):
- output = u''
-
- try:
- for i in decode_header(header):
- if i[1]:
- output += unicode(i[0], i[1], errors='ignore').strip()
- else:
- output += unicode(i[0], errors='ignore').strip()
-
- # Header parsing failed, when header has charset Shift_JIS
- except HeaderParseError:
- log.error("Failed decoding header part: {}".format(header))
- output += header
-
- if not isinstance(output, unicode):
- raise NotUnicodeError("Header part is not unicode")
-
- return output
-
- def _force_unicode(self, s):
- try:
- u = unicode(
- s,
- encoding=self.charset,
- errors='ignore',
- )
- except:
- u = unicode(
- s,
- errors='ignore',
- )
-
- if not isinstance(u, unicode):
- raise NotUnicodeError("Body part is not unicode")
-
- return u
-
- def _parse(self):
- if not self._message.keys():
- raise InvalidMail(
- "Mail without headers: {}".format(self._message.as_string())
- )
-
- self._attachments = list()
- self._text_plain = list()
- self._defects = list()
- self._has_defects = False
- self._has_anomalies = False
- self._anomalies = list()
-
- # walk all mail parts
- for p in self._message.walk():
- part_content_type = p.get_content_type()
-
- # Get all part defects
- part_defects = {part_content_type: list()}
-
- for e in p.defects:
- part_defects[part_content_type].append(
- "{}: {}".format(e.__class__.__name__, e.__doc__)
- )
-
- # Tag mail with defect
- if part_defects[part_content_type]:
- self._has_defects = True
-
- # Save all defects
- self._defects.append(part_defects)
-
- if not p.is_multipart():
- f = p.get_filename()
- if f:
- filename = self._decode_header_part(f)
- mail_content_type = self._decode_header_part(
- p.get_content_type(),
- )
- transfer_encoding = \
- unicode(p.get('content-transfer-encoding', '')).lower()
-
- if transfer_encoding == "base64":
- payload = p.get_payload(decode=False)
- else:
- payload = self._force_unicode(
- p.get_payload(decode=True),
- )
-
- self._attachments.append(
- {
- "filename": filename,
- "payload": payload,
- "mail_content_type": mail_content_type,
- "content_transfer_encoding": transfer_encoding,
- }
- )
- else:
- payload = self._force_unicode(
- p.get_payload(decode=True),
- )
- self._text_plain.append(payload)
-
- # Parsed object mail
- self._mail = {
- "attachments": self.attachments_list,
- "body": self.body,
- "date": self.date_mail,
- "from": self.from_,
- "headers": self.headers,
- "message_id": self.message_id,
- "subject": self.subject,
- "to": self.to_,
- "charset": self.charset,
- "has_defects": self._has_defects,
- "has_anomalies": self._has_anomalies,
- }
-
- # Add defects
- if self.has_defects:
- self._mail["defects"] = self.defects
-
- # Add anomalies
- if self.has_anomalies:
- self._mail["anomalies"] = self.anomalies
- self._mail["has_anomalies"] = True
-
- @property
- def body(self):
- return "\n".join(self.text_plain_list)
-
- @property
- def headers(self):
- s = ""
- for k, v in self._message.items():
- v_u = self._decode_header_part(v)
- s += k + " " + v_u + "\n"
- return s
-
- @property
- def message_id(self):
- message_id = self._message.get('message-id', None)
- if not message_id:
- self._anomalies.append('mail_without_message-id')
- return None
- else:
- return self._decode_header_part(message_id)
-
- @property
- def to_(self):
- return self._decode_header_part(
- self._message.get('to', self._message.get('delivered-to'))
- )
-
- @property
- def from_(self):
- return self._decode_header_part(
- self._message.get('from')
- )
-
- @property
- def subject(self):
- return self._decode_header_part(
- self._message.get('subject')
- )
-
- @property
- def text_plain_list(self):
- return self._text_plain
-
- @property
- def attachments_list(self):
- return self._attachments
-
- @property
- def charset(self):
- return self._message.get_content_charset('utf-8')
-
- @property
- def date_mail(self):
- date_ = self._message.get('date')
-
- if not date_:
- self._anomalies.append('mail_without_date')
- return None
-
- try:
- d = email.utils.parsedate(date_)
- t = time.mktime(d)
- return datetime.datetime.utcfromtimestamp(t)
- except:
- return None
-
- @property
- def parsed_mail_obj(self):
- return self._mail
-
- @property
- def parsed_mail_json(self):
- self._mail["date"] = self.date_mail.isoformat() \
- if self.date_mail else ""
- return json.dumps(
- self._mail,
- ensure_ascii=False,
- indent=None,
- )
-
- @property
- def defects(self):
- """The defects property contains a list of
- all the problems found when parsing this message.
- """
- return self._defects
-
- @property
- def has_defects(self):
- """Boolean: True if mail has defects. """
- return self._has_defects
-
- @property
- def anomalies(self):
- """The anomalies property contains a list of
- all anomalies in mail:
- - mail_without_date
- - mail_without_message-id
- """
- return self._anomalies
-
- @property
- def has_anomalies(self):
- if self.anomalies:
- return True
- else:
- return False
@@ -25,7 +25,7 @@
import shutil
import ssdeep
import tempfile
-from tika_app.tika_app import TikaApp
+from tikapp import TikaApp
from virus_total_apis import PublicApi as VirusTotalPublicApi
log = logging.getLogger(__name__)
@@ -342,8 +342,8 @@ def parse_sample(
self,
data,
filename,
- mail_content_type,
- transfer_encoding,
+ mail_content_type=None,
+ transfer_encoding=None,
):
"""Analyze sample and add metadata.
If it's an archive, extract it and put files in a list of dictionaries.
@@ -381,8 +381,8 @@ def parse_sample_from_base64(
self,
data,
filename,
- mail_content_type,
- transfer_encoding,
+ mail_content_type=None,
+ transfer_encoding=None,
):
"""Analyze sample and add metadata.
If it's a archive, extract it and put files in a list of dictionaries.
File renamed without changes.
File renamed without changes.
No changes.
View
Binary file not shown.
@@ -0,0 +1,2 @@
+test1
+http://www.google.it/
Binary file not shown.
File renamed without changes.
@@ -20,13 +20,13 @@
import os
import sys
import unittest
-from tika_app import tika_app as tika
+import tikapp as tika
base_path = os.path.realpath(os.path.dirname(__file__))
root = os.path.join(base_path, '..')
-sample_zip = os.path.join(root, 'unittest', 'samples', 'test.zip')
-sample_zip_1 = os.path.join(root, 'unittest', 'samples', 'test1.zip')
-sample_txt = os.path.join(root, 'unittest', 'samples', 'test.txt')
+sample_zip = os.path.join(base_path, 'samples', 'test.zip')
+sample_zip_1 = os.path.join(base_path, 'samples', 'test1.zip')
+sample_txt = os.path.join(base_path, 'samples', 'test.txt')
sys.path.append(root)
import src.modules.sample_parser as sample_parser
File renamed without changes.
@@ -27,7 +27,7 @@
import src.modules.utils as utils
-text_files = os.path.join(root, 'profiling', 'lorem_ipsum.txt')
+text_files = os.path.join(base_path, 'samples', 'lorem_ipsum.txt')
class TestSearchText(unittest.TestCase):
@@ -135,7 +135,7 @@
t.append(re.compile(r'(\b%s\b)' % i, re.I))
keywords_re_compiled.append(t)
-with open("lorem_ipsum.txt") as f:
+with open("samples/lorem_ipsum.txt") as f:
text = f.read()
Oops, something went wrong.

0 comments on commit 7981bc7

Please sign in to comment.