# SFTP to S3

Setting up SFTP server on **Windows_Server-2016-English-Full-Base**

1. Add new Inbound Rules to Security Groups
<table>
  <tr>
    <th>Port Range</th>
    <th>Source</th>
  </tr>
  <tr>
    <td>20-21</td>
    <td>Anywhere</td>
  </tr>
  <tr>
    <td>22</td>
    <td>Anywhere</td>
  </tr>
    <tr>
    <td>50000-51000</td>
    <td>Anywhere</td>
  </tr>
    <tr>
    <td>3369 & 3389</td>
    <td>Anywhere</td>
  </tr>
</table>

2. Download CopSSH and [Configure it](https://www.youtube.com/watch?v=aHKatBGrKbI).


In [78]:
import paramiko
import math
import boto3
# Get the service client 
client = boto3.client('s3')

import pprint
# prints the formatted representation of PrettyPrinter object
pp = pprint.PrettyPrinter(indent=4)

## Connect to SFTP
- We can create an SFTPClient object, connected to the computer on which the remote file operations will be performed, in two ways:
    1. Use a Paramiko Transport object to establish a connection to the (remote) computer, then create the SFTPClient object from that Transport object.
    2. Create a Paramiko SSHClient object, then use it to open an SFTP connection and obtain an SFTPClient object.

In [46]:
# Connection settings for the SFTP server and the destination S3 bucket.
# NOTE(review): the password is committed in plain text; move it to an
# environment variable or a secrets store before sharing this notebook.
host = "ec2-15-206-125-2.ap-south-1.compute.amazonaws.com"
port = 22
user = "test_user"
pss = "Rishabh@123"
# Path to a private key file; leave as None to authenticate with the password.
keyfilepath = None
# Key algorithm of `keyfilepath` ('RSA' or 'DSA'). The connection cell reads
# this name whenever `keyfilepath` is set, so it must always be defined.
keyfiletype = "RSA"
bucket_name = "rishabhsengar2611"

### Paramiko Transport object

In [47]:
# Load the private key when a key file is configured; with no key file the
# connection falls back to password authentication.
key = None
if keyfilepath is not None:
    if keyfiletype == 'DSA':
        # The private key is a DSA type key.
        key = paramiko.DSSKey.from_private_key_file(keyfilepath)
    else:
        # The private key is an RSA type key. from_private_key() expects an
        # open file object, so the *_file variant is needed for a path.
        key = paramiko.RSAKey.from_private_key_file(keyfilepath)

# Transport takes the address as one (host, port) tuple; a second positional
# argument is interpreted as default_window_size, not as the port.
transport = paramiko.Transport((host, port))
# A non-None pkey is used for authentication; otherwise the password is used.
transport.connect(username=user, password=pss, pkey=key)
sftp = paramiko.SFTPClient.from_transport(transport)

# # Close SFTP
# sftp.close()
# # Close tansport
# transport.close()

### Paramiko SSHClient object

In [48]:
# # If key is provided, then add the key
# if keyfilepath is not None:
# # Get private key used to authenticate user.
#     if keyfiletype == 'DSA':
# # The private key is a DSA type key.
#         key = paramiko.DSSKey.from_private_key_file(keyfilepath)
#     else:
# # The private key is a RSA type key.
#         key = paramiko.RSAKey.from_private_key(keyfilepath)

# # Connect SSH client accepting all host keys.
# ssh = paramiko.SSHClient()
# ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# # add key attribute if provided
# ssh.connect(host, port, username = user, password = pss)

# # Using the SSH client, create a SFTP client.
# sftp = ssh.open_sftp()
# # Keep a reference to the SSH client in the SFTP client as to prevent the former from
# # being garbage collected and the connection from being closed.
# sftp.sshclient = ssh

# # Close SFTP
# sftp.close()
# # Close SSH
# ssh.close()

## Some Manipulations

### Retrieving the list of files

In [49]:
# Fetch the entry names of the remote ./ftp_files directory and echo them,
# one per line.
dirlist = sftp.listdir('./ftp_files')
for entry_name in dirlist:
    print(entry_name)

9G9VYW23CU.csv
Big-Data-Landscape-2017.pdf
Big-Data-Landscape-2018.pdf
Big-Data-Landscape-2019.pdf
D029LRLIRA.csv
DN7A49XY69.csv
EDTMQD3VFB.csv
file.txt
GJNZEL7QS7.csv
Measurement_info.csv
Measurement_summary.csv
MOCK_DATA.json
N84NQPAZ5A.csv
people.json
SG75B3AMDD.csv
WMM6GGSTIQ.csv


### Get only CSV files 

In [50]:
def filter_names(n):
    """Return True when the file name *n* has a ``.csv`` extension.

    The dot is part of the suffix so that names which merely end in the
    letters "csv" (for example ``notcsv``) are not treated as CSV files.
    """
    return n.endswith('.csv')

# Keep only the directory entries accepted by filter_names.
ftp_files = [name for name in dirlist if filter_names(name)]

# Show the filtered list.
pp.pprint(ftp_files)

[   '9G9VYW23CU.csv',
    'D029LRLIRA.csv',
    'DN7A49XY69.csv',
    'EDTMQD3VFB.csv',
    'GJNZEL7QS7.csv',
    'Measurement_info.csv',
    'Measurement_summary.csv',
    'N84NQPAZ5A.csv',
    'SG75B3AMDD.csv',
    'WMM6GGSTIQ.csv']


### Files which are already present in S3

In [51]:
# Fetch the listing of the destination bucket.
# NOTE(review): a single list_objects_v2 call returns at most 1,000 keys; a
# larger bucket would need pagination.
try:
    s3_files = client.list_objects_v2(Bucket=bucket_name)
except client.exceptions.ClientError as err:
    # Only a missing bucket is reported here; any other service error
    # propagates instead of being mislabelled as a wrong bucket name.
    if err.response['Error']['Code'] != 'NoSuchBucket':
        raise
    print("No such Bucket \n")
    # Leave an empty listing so s3_files is always defined after this cell.
    s3_files = {}

pp.pprint(s3_files)

{   'Contents': [   {   'ETag': '"f903d3cf0c7f126fa9f939c3e07ee66d"',
                        'Key': 'EDTMQD3VFB.csv',
                        'LastModified': datetime.datetime(2020, 4, 15, 11, 49, 14, tzinfo=tzutc()),
                        'Size': 237,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"a17ae887587039441d68f8d3ea1eeeb6"',
                        'Key': 'GJNZEL7QS7.csv',
                        'LastModified': datetime.datetime(2020, 4, 15, 11, 49, 14, tzinfo=tzutc()),
                        'Size': 316,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"a111385963ba51827e8a407dcac1e867"',
                        'Key': 'MOCK_DATA.json',
                        'LastModified': datetime.datetime(2020, 4, 15, 11, 47, 45, tzinfo=tzutc()),
                        'Size': 287,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"90782d0821d06338747f7927e53e3d00"',
   

In [52]:
# Dictionary Access to get the name of the first file
s3_files['Contents'][0]['Key']

'EDTMQD3VFB.csv'

In [53]:
# Dictionary Access to get the size of the first file
s3_files['Contents'][0]['Size']

237

In [54]:
# Files which are already present in S3.
# NOTE(review): a single list_objects_v2 call returns at most 1,000 keys; a
# larger bucket would need pagination.
try:
    s3_files = client.list_objects_v2(Bucket=bucket_name)
except client.exceptions.ClientError as err:
    # Only a missing bucket is reported here; any other service error
    # propagates instead of being mislabelled as a wrong bucket name.
    if err.response['Error']['Code'] != 'NoSuchBucket':
        raise
    print("No such Bucket \n")
    # Leave an empty listing so s3_files is always defined after this cell.
    s3_files = {}

pp.pprint(s3_files)

# Dictionary access to get the name of the first file
s3_files['Contents'][0]['Key']

# Dictionary access to get the size of the first file
s3_files['Contents'][0]['Size']

{   'Contents': [   {   'ETag': '"f903d3cf0c7f126fa9f939c3e07ee66d"',
                        'Key': 'EDTMQD3VFB.csv',
                        'LastModified': datetime.datetime(2020, 4, 15, 11, 49, 14, tzinfo=tzutc()),
                        'Size': 237,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"a17ae887587039441d68f8d3ea1eeeb6"',
                        'Key': 'GJNZEL7QS7.csv',
                        'LastModified': datetime.datetime(2020, 4, 15, 11, 49, 14, tzinfo=tzutc()),
                        'Size': 316,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"a111385963ba51827e8a407dcac1e867"',
                        'Key': 'MOCK_DATA.json',
                        'LastModified': datetime.datetime(2020, 4, 15, 11, 47, 45, tzinfo=tzutc()),
                        'Size': 287,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"90782d0821d06338747f7927e53e3d00"',
   

237

### Get S3 file names and their sizes

In [55]:
# Names and sizes of the objects already stored in the bucket, in listing order.
s3_files_names = []
s3_files_size = []

print("File Name \t\t Size\n")
# A listing without a 'Contents' key is reported as an empty bucket. The lookup
# is done once, outside the loop, so that a KeyError raised by a malformed
# entry is not mistaken for an empty bucket.
try:
    s3_objects = s3_files['Contents']
except KeyError:
    print("Bucket is Empty")
else:
    for s3_object in s3_objects:
        s3_files_names.append(s3_object['Key'])
        print(s3_object['Key'], end='\t\t')
        s3_files_size.append(s3_object['Size'])
        print(s3_object['Size'], end='\n')


File Name 		 Size

EDTMQD3VFB.csv		237
GJNZEL7QS7.csv		316
MOCK_DATA.json		287
SG75B3AMDD.csv		387
WMM6GGSTIQ.csv		277
file.txt		319
people.json		1612897


### Remove files already present in S3

In [56]:
# Names found both on the SFTP server and in S3; filled in by filter_names.
s3_files_present = []


def filter_names(n):
    """Tell whether the file name *n* still has to be uploaded.

    Names listed in s3_files_names are appended to s3_files_present and
    rejected (False); every other name is accepted (True).
    """
    already_in_s3 = n in s3_files_names
    if already_in_s3:
        s3_files_present.append(n)
    return not already_in_s3

# Drop every name that filter_names rejects (it records those in
# s3_files_present as a side effect).
ftp_files = [name for name in ftp_files if filter_names(name)]

print(" To be Added")
pp.pprint(ftp_files)
print("\n Already Present")
pp.pprint(s3_files_present)

 To be Added
[   '9G9VYW23CU.csv',
    'D029LRLIRA.csv',
    'DN7A49XY69.csv',
    'Measurement_info.csv',
    'Measurement_summary.csv',
    'N84NQPAZ5A.csv']

 Already Present
['EDTMQD3VFB.csv', 'GJNZEL7QS7.csv', 'SG75B3AMDD.csv', 'WMM6GGSTIQ.csv']


In [57]:
# Map each pending file name to the st_size reported by the SFTP server.
# sftp.stat(path) retrieves information about a file on the remote system.
ftp_size = {
    name: sftp.stat('./ftp_files/' + name).st_size
    for name in ftp_files
}

pp.pprint(ftp_size)

{   '9G9VYW23CU.csv': 354,
    'D029LRLIRA.csv': 341,
    'DN7A49XY69.csv': 467,
    'Measurement_info.csv': 124452984,
    'Measurement_summary.csv': 94076158,
    'N84NQPAZ5A.csv': 429}


## Adding files to S3

### Upload files of at most 6 MB in a single request

In [59]:
# Remote path of the first pending file (first key of ftp_size).
ftp_file_path = './ftp_files/' + list(ftp_size)[0]
print(ftp_file_path)

# Open that remote file for reading.
ftp_file = sftp.file(ftp_file_path, mode='r')

'./ftp_files/9G9VYW23CU.csv'

In [61]:
# Read the whole remote file into memory. The upload cell that follows reopens
# the file and passes the file object itself to S3, not this buffer.
ftp_file_data = ftp_file.read()

In [64]:
# Upload the first pending file in a single request when it fits in one chunk.
# NOTE(review): chunk_size is assigned in a later cell; run that cell first.
first_name, first_size = list(ftp_size.items())[0]

# The context manager closes the remote handle even when the upload raises or
# the file turns out to be too large for a single request.
with sftp.file(ftp_file_path, 'r') as ftp_file:
    if first_size <= int(chunk_size):
        print('Transferring complete File from FTP to S3...')
        client.upload_fileobj(Fileobj=ftp_file, Bucket=bucket_name, Key=first_name)
        print('Successfully Transferred file from FTP to S3!')

Transferring complete File from FTP to S3...
Successfully Transferred file from FTP to S3!


### Files to be uploaded in chunks

In [66]:
# Size of one multipart chunk: 6 MB (6 * 1024 * 1024 bytes).
# AWS does not accept parts smaller than 5 MB, so the chunk size is kept
# above that limit.
chunk_size = 6 * 1024 * 1024

In [74]:
# Names of the files whose size exceeds chunk_size. Nothing is uploaded here;
# the list only selects candidates for the chunked upload.
big_files = [name for name in ftp_size if ftp_size[name] > chunk_size]

# Display the selection.
big_files

Measurement_info.csv
Measurement_summary.csv


['Measurement_info.csv', 'Measurement_summary.csv']

In [76]:
ftp_size[big_files[0]]

124452984

In [79]:
# Number of chunk_size-sized pieces needed to cover the first big file,
# rounded up so that a trailing partial chunk is counted.
chunk_count = math.ceil(ftp_size[big_files[0]] / chunk_size)
chunk_count

20

In [80]:
# Remote path of the first big file; open it for reading.
ftp_file_path = '/'.join(('./ftp_files', big_files[0]))
print(ftp_file_path)
ftp_file = sftp.file(ftp_file_path, mode='r')


./ftp_files/Measurement_info.csv


In [88]:
# Stream the first big file from SFTP to S3 as a multipart upload, reading
# chunk_size bytes per part.
big_file_key = big_files[0]
multipart_upload = client.create_multipart_upload(Bucket=bucket_name, Key=big_file_key)
upload_id = multipart_upload['UploadId']
parts = []

ftp_file_path = './ftp_files/' + big_file_key
print(ftp_file_path)

try:
    # The context manager closes the remote handle even if a chunk fails.
    with sftp.file(ftp_file_path, 'r') as ftp_file:
        for part_number in range(1, chunk_count + 1):
            print('Transferring chunk {}...'.format(part_number))

            chunk = ftp_file.read(int(chunk_size))

            part = client.upload_part(
                Bucket=bucket_name,
                Key=big_file_key,
                PartNumber=part_number,
                UploadId=upload_id,
                Body=chunk,
            )

            # complete_multipart_upload wants only PartNumber and ETag for each
            # part, not the whole upload_part response.
            parts.append({'PartNumber': part_number, 'ETag': part['ETag']})
            print('Chunk {} Transferred Successfully!'.format(part_number))

    part_info = {'Parts': parts}
    client.complete_multipart_upload(
        Bucket=bucket_name,
        Key=big_file_key,
        UploadId=upload_id,
        MultipartUpload=part_info,
    )
except Exception:
    # Abort so the parts uploaded so far are discarded rather than left behind
    # as an unfinished upload, then let the original error surface.
    client.abort_multipart_upload(
        Bucket=bucket_name,
        Key=big_file_key,
        UploadId=upload_id,
    )
    raise

print('All chunks Transferred to S3 bucket! File Transfer successful!')

./ftp_files/Measurement_info.csv
Transferring chunk 1...
Chunk 1 Transferred Successfully!
Transferring chunk 2...


Socket exception: Connection reset by peer (104)


SSHException: Server connection dropped: 

# FTP to S3

## Libraries Used

In [3]:
from ftplib import FTP
import string
import os
import time
import math

import pprint
# prints the formatted representation of PrettyPrinter object
pp = pprint.PrettyPrinter(indent=4)


import boto3
# Get the service client 
client = boto3.client('s3')


## Setting FTP Connection

In [4]:
# Connect to the FTP server (domain name or server IP) and log in. Passing
# `user` to the FTP constructor makes it call login() right after connecting.
ftp = FTP(
    host='ec2-52-66-211-38.ap-south-1.compute.amazonaws.com',
    user='test_user',
    passwd='rishabh',
)

TimeoutError: [Errno 110] Connection timed out

### Get all the file names

In [None]:
# Make /files/ftp the current directory on the server so that the listing
# below is taken from that directory.
ftp.cwd('/files/ftp')

# Names of the entries in the current server directory.
ftp_files = ftp.nlst()

# Show the listing.
pp.pprint(ftp_files)

### Get only CSV files 

In [44]:
def filter_names(n):
    """Return True when the file name *n* has a ``.csv`` extension.

    The dot is part of the suffix so that names which merely end in the
    letters "csv" (for example ``notcsv``) are not treated as CSV files.
    """
    return n.endswith('.csv')

# Keep only the names accepted by filter_names.
ftp_files = [name for name in ftp_files if filter_names(name)]

# Show the filtered list.
pp.pprint(ftp_files)

[   '9G9VYW23CU.csv',
    'D029LRLIRA.csv',
    'DN7A49XY69.csv',
    'EDTMQD3VFB.csv',
    'GJNZEL7QS7.csv',
    'Measurement_info.csv',
    'Measurement_summary.csv',
    'N84NQPAZ5A.csv',
    'SG75B3AMDD.csv',
    'WMM6GGSTIQ.csv']


### Files which are already present in S3

In [45]:
# Fetch the listing of the destination bucket.
# NOTE(review): a single list_objects_v2 call returns at most 1,000 keys; a
# larger bucket would need pagination.
try:
    s3_files = client.list_objects_v2(Bucket='rishabhsengar2611')
except client.exceptions.ClientError as err:
    # Only a missing bucket is reported here; any other service error
    # propagates instead of being mislabelled as a wrong bucket name.
    if err.response['Error']['Code'] != 'NoSuchBucket':
        raise
    print("No such Bucket \n")
    # Leave an empty listing so s3_files is always defined after this cell.
    s3_files = {}

pp.pprint(s3_files)

{   'Contents': [   {   'ETag': '"a0ad38f3a09a0d0d0c2dd905736e6986"',
                        'Key': '9G9VYW23CU.csv',
                        'LastModified': datetime.datetime(2020, 4, 15, 11, 49, 13, tzinfo=tzutc()),
                        'Size': 354,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"571d1690191e982850f6e593049ff428"',
                        'Key': 'D029LRLIRA.csv',
                        'LastModified': datetime.datetime(2020, 4, 15, 11, 49, 13, tzinfo=tzutc()),
                        'Size': 341,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"8d2a79db85621494aa0c00583b0058cc"',
                        'Key': 'DN7A49XY69.csv',
                        'LastModified': datetime.datetime(2020, 4, 15, 11, 49, 14, tzinfo=tzutc()),
                        'Size': 467,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"f903d3cf0c7f126fa9f939c3e07ee66d"',
   

In [46]:
# Dictionary Access to get the name of the first file
s3_files['Contents'][0]['Key']

'9G9VYW23CU.csv'

In [47]:
# Dictionary Access to get the size of the first file
s3_files['Contents'][0]['Size']

354

### Get S3 file names and their sizes

In [48]:
# Names and sizes of the objects already stored in the bucket, in listing order.
s3_files_names = []
s3_files_size = []

print("File Name \t\t Size\n")
# A listing without a 'Contents' key is reported as an empty bucket. The lookup
# is done once, outside the loop, so that a KeyError raised by a malformed
# entry is not mistaken for an empty bucket.
try:
    s3_objects = s3_files['Contents']
except KeyError:
    print("Bucket is Empty")
else:
    for s3_object in s3_objects:
        s3_files_names.append(s3_object['Key'])
        print(s3_object['Key'], end='\t\t')
        s3_files_size.append(s3_object['Size'])
        print(s3_object['Size'], end='\n')


File Name 		 Size

9G9VYW23CU.csv		354
D029LRLIRA.csv		341
DN7A49XY69.csv		467
EDTMQD3VFB.csv		237
GJNZEL7QS7.csv		316
MOCK_DATA.json		287
N84NQPAZ5A.csv		429
SG75B3AMDD.csv		387
WMM6GGSTIQ.csv		277
file.txt		319
people.json		1612897


### Remove files already present in S3

In [49]:
# Names found both on the FTP server and in S3; filled in by filter_names.
s3_files_present = []


def filter_names(n):
    """Tell whether the file name *n* still has to be uploaded.

    Names listed in s3_files_names are appended to s3_files_present and
    rejected (False); every other name is accepted (True).
    """
    already_in_s3 = n in s3_files_names
    if already_in_s3:
        s3_files_present.append(n)
    return not already_in_s3

# Drop every name that filter_names rejects (it records those in
# s3_files_present as a side effect).
ftp_files = [name for name in ftp_files if filter_names(name)]

print(" To be Added")
pp.pprint(ftp_files)
print("\n Already Present")
pp.pprint(s3_files_present)

 To be Added
['Measurement_info.csv', 'Measurement_summary.csv']

 Already Present
[   '9G9VYW23CU.csv',
    'D029LRLIRA.csv',
    'DN7A49XY69.csv',
    'EDTMQD3VFB.csv',
    'GJNZEL7QS7.csv',
    'N84NQPAZ5A.csv',
    'SG75B3AMDD.csv',
    'WMM6GGSTIQ.csv']


In [50]:
# Map each pending file name to its size on the FTP server.
# FTP.size(filename) requests the size of the named file from the server.
ftp_size = {name: ftp.size(name) for name in ftp_files}

pp.pprint(ftp_size)

{'Measurement_info.csv': 124452984, 'Measurement_summary.csv': 94076158}


## Adding files to S3

In [51]:
# Size of one multipart chunk: 6 MB (6 * 1024 * 1024 bytes).
# AWS does not accept parts smaller than 5 MB, so the chunk size is kept
# above that limit.
chunk_size = 6 * 1024 * 1024

In [52]:
# Print every pending file name followed by its size, separated by two tabs.
for name, size in ftp_size.items():
    print("{}\t\t{}".format(name, size))

Measurement_info.csv		124452984
Measurement_summary.csv		94076158


### Upload files smaller than 6 MB

In [53]:
# Files of chunk_size or more are only recorded here; they are not uploaded
# by this cell.
big_files = []

# Download each small file to a local staging path, then upload it to S3.
for file in ftp_size:
    if ftp_size[file] < chunk_size:

        # Local staging path for the download.
        local_file = os.path.join('/home/bluepi/Desktop/', file)

        # Retrieve the file in binary transfer mode. RETR asks the server to
        # send the contents of a file over the data connection; retrbinary
        # hands every received block to the callback. The with-block makes sure
        # the local copy is flushed and closed before it is reopened below.
        with open(local_file, 'wb') as download:
            ftp.retrbinary('RETR ' + file, download.write)
        print("retieved file :\t" + file, end='\n')

        # Upload the staged copy as a binary file object.
        with open(local_file, 'rb') as data:
            client.upload_fileobj(Fileobj=data, Bucket='rishabhsengar2611', Key=file)

        print("uploaded file :\t" + file, end='\n')
        # The staged copy is kept on disk; call os.remove(local_file) here to
        # delete it after a successful upload.
    else:
        big_files.append(file)
        print("big files :  " + file + "  -->  size :  " + str(ftp_size[file]), end='\n')
        
# Errors :-
# Brokenpipeerror errno 32
# ftplib.error_perm: 530 Login authentication failed

big files :  Measurement_info.csv  -->  size :  124452984
big files :  Measurement_summary.csv  -->  size :  94076158


# FTP with PySpark

## Get the list of files present in FTP

In [14]:
# ftplib lets Python programs perform automated FTP jobs.
from ftplib import FTP

# Plain FTP is used here; switch to ftplib.FTP_TLS when the server speaks FTP
# over TLS. login() defaults to user 'anonymous' / password 'anonymous@' when
# no credentials are given.
# NOTE(review): the credentials below are hard-coded in plain text.
#
# The context manager closes the connection when the block ends, including
# when one of the commands raises.
with FTP('13.233.66.93') as ftp:  # domain name or server IP
    ftp.login(user='test_user', passwd='rishabh')

    # Set the current directory on the server
    print(ftp.cwd('/files/ftp'))

    # List of file names in that directory
    lst = ftp.nlst()
    print(lst, end='\n')

250 CWD successful. "/files/ftp" is current directory.
['Big-Data-Landscape-2017.pdf', 'Big-Data-Landscape-2018.pdf', 'Big-Data-Landscape-2019.pdf', 'D029LRLIRA.csv', 'EDTMQD3VFB.csv', 'file.txt', 'MOCK_DATA.json', 'people.json', 'SG75B3AMDD.csv', 'WMM6GGSTIQ.csv']


'221 Goodbye'

## Add the file to be downloaded with this Spark job on every node.

In [3]:
# SparkContext is the main entry point for Spark functionality.
from pyspark import SparkContext

# SparkFiles resolves paths to files added through SparkContext.addFile.
from pyspark import SparkFiles

# Get or instantiate a SparkContext and register it as a singleton object
sc = SparkContext.getOrCreate()

# Basic structure of the FTP URL:
# ftp://<user>:<password>@<host>:<port>/<url-path>
ftp_path = "ftp://test_user:rishabh@192.168.1.7"

# Add a file to be downloaded with this Spark job on every node.
# NOTE(review): presumably each entry of lst is an absolute server path such as
# '/dir/name.csv'; a bare file name would need a '/' separator here - confirm.
sc.addFile(ftp_path + lst[0])

# SparkFiles.get takes the bare name of a file added through addFile, so look
# up the entry that was actually added (lst[0]) by its last path component
# rather than by a full URL built from a different entry.
absolute_path = SparkFiles.get(lst[0].rpartition('/')[-1])

# Get the root directory that contains the added files
directory_path = SparkFiles.getRootDirectory()

## Get the path where file is present

In [6]:
import os

# Only the last '/'-separated component of the FTP entry is used as the local
# file name; it is joined onto Spark's root directory for added files.
local_name = lst[0].rsplit('/', 1)[-1]
path2 = os.path.join(directory_path, local_name)
path2

'/tmp/spark-a3d5a473-ef64-48b0-99ae-f48c45dc07f1/userFiles-bb12823e-d0df-4190-9703-819434301c81/main_table.csv'

## Read the file

In [7]:
from pyspark.sql import SparkSession

# Reuse the active session or start one named 'Basics'.
spark = SparkSession.builder.appName('Basics').getOrCreate()

# CSV reader with the header option on and schema inference off.
csv_reader = spark.read.format('csv').options(header=True, inferschema=False)
# Load from Spark's root directory for added files.
df = csv_reader.load(directory_path)
          

In [14]:
df.show(10)

+----+---------+-----+
|p_id|   p_name|price|
+----+---------+-----+
|   1|      Job| 1464|
|   2|   Keylex|  208|
|   3|   Duobam| 1684|
|   4|Ronstring| 1961|
|   5|  Bitwolf| 1338|
|   6|  Andalax|   22|
|   7|   Duobam| 1167|
|   8|    Alpha| 1573|
|   9|  Fix San| 1516|
|  10|   Biodex| 1916|
+----+---------+-----+
only showing top 10 rows



## Store in S3

In [9]:
import boto3

# create the s3 instance 
client = boto3.client('s3')

In [11]:
import csv
import io

# Serialise the DataFrame to CSV text (header row first, then one line per
# row). Passing b'df' as the body would upload those two literal bytes rather
# than the DataFrame's contents.
# NOTE(review): collect() brings every row to the driver; fine for a small
# table, not for a large one.
csv_buffer = io.StringIO()
csv_writer = csv.writer(csv_buffer)
csv_writer.writerow(df.columns)
csv_writer.writerows(df.collect())

response = client.put_object(
    ACL='private',
    Body=csv_buffer.getvalue().encode('utf-8'),  # Object data
    Bucket='rishabhsengar2611',  # Bucket name to which the PUT operation was initiated
    Key='new_added.csv',  # Object key for which the PUT operation was initiated
)