# SFTP to S3

Setting up SFTP server on **Windows_Server-2016-English-Full-Base**

1. Add new Inbound Rules to Security Groups
<table>
  <tr>
    <th>Port Range</th>
    <th>Source</th>
    <th>Why</th>
  </tr>
  <tr>
    <td>20</td>
    <td>Anywhere</td>
    <td>For FTP (Data Channel)</td>
  </tr>
  <tr>
    <td>21</td>
    <td>Anywhere</td>
    <td>For FTP (Command Channel)</td>
  </tr>
  <tr>
    <td>22</td>
    <td>Anywhere</td>
    <td>For SFTP</td>
  </tr>
  <tr>
    <td>50000-51000</td>
    <td>Anywhere</td>
    <td>Passive Mode FTP</td>
  </tr>
  <tr>
    <td>3369 &amp; 3389</td>
    <td>Anywhere</td>
    <td>For RDP</td>
  </tr>
</table>

2. Download CopSSH and [Configure it](https://www.youtube.com/watch?v=aHKatBGrKbI).


**Active FTP Mode**
- The user connects from a random port on the file transfer client to FTP port 21 on the server.
- The server connects from port 20 to the client port designated for the data channel.
<img src="https://www.jscape.com/hs-fs/hub/26878/file-13611001-png/images/ftp_active_mode-resized-600.png" />

**Passive FTP Mode**
- The client connects from a random port to port 21 on the server
- The client connects from another random port to the random port specified in the server's response
<img src="https://www.jscape.com/hs-fs/hub/26878/file-13611186-png/images/ftp_passive_mode-resized-600.png" />

**SFTP Mode**
<img src="https://www.exavault.com/blog/app/uploads/2019/01/Screen-Shot-2019-01-14-at-4.30.46-PM-1024x544.png" width="600" />


In [1]:
import math
import pprint

import boto3
import paramiko

# S3 service client shared by every bucket operation in this notebook
client = boto3.client('s3')

# Pretty-printer with a 4-space indent for readable dumps of nested data
pp = pprint.PrettyPrinter(indent=4)

## Connect to SFTP
- We can create an SFTPClient object, connected to the computer on which the remote file operations are performed, in two ways:
    1. Use a Paramiko Transport object to establish a connection to the (remote) computer and then create the SFTPClient object from that Transport object.
    2. Create a Paramiko SSHClient object, which is then used to open an SFTP connection and obtain an SFTPClient object.

In [2]:
# Connection settings for the SFTP server and the target S3 bucket.
# NOTE(review): the host name and password are hard-coded; move them to
# environment variables or a secrets store before sharing this notebook.
host = "ec2-13-127-30-239.ap-south-1.compute.amazonaws.com"
port = 22
user = "test_user"
pss = "Rishabh@"
# Path to a private key file; while it is None the connection cell skips
# key loading and authenticates with the password.
keyfilepath = None
# Type of the key stored at keyfilepath: 'DSA' selects paramiko.DSSKey, any
# other value is treated as RSA. It is defined here because the connection
# cell reads it whenever keyfilepath is set.
keyfiletype = "RSA"
bucket_name = "rishabhsengar2611"

### Paramiko Transport object

In [3]:
# Load the private key (when one is configured) and open an SFTP session
# over a paramiko Transport.
key = None
if keyfilepath is not None:
    # keyfiletype is optional configuration: only the value 'DSA' selects a
    # DSA key; anything else (including the name being undefined) is RSA.
    if globals().get('keyfiletype') == 'DSA':
        key = paramiko.DSSKey.from_private_key_file(keyfilepath)
    else:
        # from_private_key() expects an open file object; a path on disk
        # has to go through from_private_key_file().
        key = paramiko.RSAKey.from_private_key_file(keyfilepath)

# The address must be given as ONE (host, port) tuple. A second positional
# argument to Transport is interpreted as default_window_size, not the port.
transport = paramiko.Transport((host, port))
# Hand the key over as well; with pkey=None paramiko falls back to the
# password.
transport.connect(username=user, password=pss, pkey=key)
sftp = paramiko.SFTPClient.from_transport(transport)


In [4]:
# Ask the transport whether its session is still active
transport.is_active()

True

### Paramiko SSHClient object

In [22]:
# Alternative way to obtain `sftp` through paramiko.SSHClient.
# The whole cell is left disabled; the Transport-based cell is the one in use.
# # If key is provided, then add the key
# if keyfilepath is not None:
# # Get private key used to authenticate user.
#     if keyfiletype == 'DSA':
# # The private key is a DSA type key.
#         key = paramiko.DSSKey.from_private_key_file(keyfilepath)
#     else:
# # The private key is an RSA type key.
#         key = paramiko.RSAKey.from_private_key_file(keyfilepath)

# # Connect SSH client accepting all host keys.
# ssh = paramiko.SSHClient()
# ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
# # add key attribute if provided
# ssh.connect(host, port, username = user, password = pss)

# # Using the SSH client, create a SFTP client.
# sftp = ssh.open_sftp()
# # Keep a reference to the SSH client in the SFTP client as to prevent the former from
# # being garbage collected and the connection from being closed.
# sftp.sshclient = ssh

# # Close SFTP
# sftp.close()
# # Close SSH
# ssh.close()

## Some Manipulations

### Retrieving the list of files

In [5]:
# Fetch the names of all entries in ./ftp_files on the SFTP server
dirlist = sftp.listdir('./ftp_files')
# Echo each entry on its own line
for entry in dirlist:
    print(entry)

costs.csv
date_1.csv
Measurement_info.csv
Measurement_summary.csv
MOCK_DATA.json
NoSQL.odt
people.json
WMM6GGSTIQ.csv


### Get only CSV files 

In [6]:
def filter_names(n):
    """Return True when the file name ``n`` carries a ``.csv`` extension.

    The dot is part of the check so that names which merely end in the
    letters "csv" (for example ``notcsv``) are not mistaken for CSV files.
    """
    return n.endswith('.csv')

# Keep only the directory entries that filter_names accepts
ftp_files = [name for name in dirlist if filter_names(name)]

# Show the selection
pp.pprint(ftp_files)

[   'costs.csv',
    'date_1.csv',
    'Measurement_info.csv',
    'Measurement_summary.csv',
    'WMM6GGSTIQ.csv']


### Files which are already present in S3

In [7]:
# Fetch the object listing of the target bucket. The entries, when there
# are any, are found under the 'Contents' key of the response.
try:
    s3_files = client.list_objects_v2(Bucket=bucket_name)
except client.exceptions.NoSuchBucket:
    # Only a missing bucket is reported as such; every other failure
    # (credentials, network, ...) propagates with its own error message.
    print("No such Bucket \n")
    # Stop here: s3_files was never assigned, so printing it below would
    # only fail with an unrelated NameError.
    raise

pp.pprint(s3_files)

{   'Contents': [   {   'ETag': '"a0ad38f3a09a0d0d0c2dd905736e6986"',
                        'Key': '9G9VYW23CU.csv',
                        'LastModified': datetime.datetime(2020, 4, 18, 13, 49, 54, tzinfo=tzutc()),
                        'Size': 354,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"571d1690191e982850f6e593049ff428"',
                        'Key': 'D029LRLIRA.csv',
                        'LastModified': datetime.datetime(2020, 4, 22, 11, 43, 19, tzinfo=tzutc()),
                        'Size': 341,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"8d2a79db85621494aa0c00583b0058cc"',
                        'Key': 'DN7A49XY69.csv',
                        'LastModified': datetime.datetime(2020, 4, 22, 12, 37, 31, tzinfo=tzutc()),
                        'Size': 467,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"f903d3cf0c7f126fa9f939c3e07ee66d"',
   

In [8]:
# Name ('Key') of the first object in the listing.
# Raises KeyError when the response has no 'Contents' entry.
s3_files['Contents'][0]['Key']

'9G9VYW23CU.csv'

In [9]:
# 'Size' value of the first object in the listing.
# Raises KeyError when the response has no 'Contents' entry.
s3_files['Contents'][0]['Size']

354

In [10]:
### Files which are already present in S3

# Fetch the object listing of the target bucket. The entries, when there
# are any, are found under the 'Contents' key of the response.
try:
    s3_files = client.list_objects_v2(Bucket=bucket_name)
except client.exceptions.NoSuchBucket:
    # Only a missing bucket is reported as such; every other failure
    # (credentials, network, ...) propagates with its own error message.
    print("No such Bucket \n")
    # Stop here: s3_files was never assigned, so the lines below would
    # only fail with an unrelated NameError.
    raise

pp.pprint(s3_files)

# Dictionary access to get the name of the first file
s3_files['Contents'][0]['Key']

# Dictionary access to get the size of the first file
# (as the last expression of the cell, this is the value the notebook shows)
s3_files['Contents'][0]['Size']

{   'Contents': [   {   'ETag': '"a0ad38f3a09a0d0d0c2dd905736e6986"',
                        'Key': '9G9VYW23CU.csv',
                        'LastModified': datetime.datetime(2020, 4, 18, 13, 49, 54, tzinfo=tzutc()),
                        'Size': 354,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"571d1690191e982850f6e593049ff428"',
                        'Key': 'D029LRLIRA.csv',
                        'LastModified': datetime.datetime(2020, 4, 22, 11, 43, 19, tzinfo=tzutc()),
                        'Size': 341,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"8d2a79db85621494aa0c00583b0058cc"',
                        'Key': 'DN7A49XY69.csv',
                        'LastModified': datetime.datetime(2020, 4, 22, 12, 37, 31, tzinfo=tzutc()),
                        'Size': 467,
                        'StorageClass': 'STANDARD'},
                    {   'ETag': '"f903d3cf0c7f126fa9f939c3e07ee66d"',
   

354

### Get S3 file names and their sizes

In [11]:
# Parallel lists filled in listing order: object names and their sizes
s3_files_names = []
s3_files_size = []

print("File Name \t\t Size\n")
try:
    # Walk the listing entries directly; each one contributes a name and a size
    for obj in s3_files['Contents']:
        s3_files_names.append(obj['Key'])
        print(obj['Key'], end='\t\t')
        s3_files_size.append(obj['Size'])
        print(obj['Size'])
except KeyError:
    # A listing without a 'Contents' entry is reported as an empty bucket
    print("Bucket is Empty")


File Name 		 Size

9G9VYW23CU.csv		354
D029LRLIRA.csv		341
DN7A49XY69.csv		467
EDTMQD3VFB.csv		237
GJNZEL7QS7.csv		316
MOCK_DATA.json		287
SG75B3AMDD.csv		387
WMM6GGSTIQ.csv		277
file.txt		319
people.json		1612897


### Remove files that are already present in S3

In [12]:
# Names that exist both on the SFTP server and in the bucket
s3_files_present = []


def filter_names(n):
    """Return True for names missing from S3; record the others as present."""
    if n not in s3_files_names:
        return True
    s3_files_present.append(n)
    return False


# Keep only the files that still have to be uploaded
ftp_files = [name for name in ftp_files if filter_names(name)]

print(" To be Added")
pp.pprint(ftp_files)
print("\n Already Present")
pp.pprint(s3_files_present)

 To be Added
['costs.csv', 'date_1.csv', 'Measurement_info.csv', 'Measurement_summary.csv']

 Already Present
['WMM6GGSTIQ.csv']


In [13]:
# Map every pending file name to its size in bytes, as reported by
# sftp.stat() for the file's path under ./ftp_files
ftp_size = {
    name: sftp.stat('./ftp_files/' + name).st_size
    for name in ftp_files
}

pp.pprint(ftp_size)

{   'Measurement_info.csv': 124452984,
    'Measurement_summary.csv': 94076158,
    'costs.csv': 612,
    'date_1.csv': 749}


## Adding files to S3

### Uploading files that are not larger than 6 MB

In [14]:
# AWS requires each part of a multipart upload to be at least 5 MB,
# so files are cut into 6 MB pieces (6 * 1024 * 1024 = 6291456 bytes).
chunk_size = 6 * 1024 * 1024

In [18]:
# Build the remote path of the second file in the size table
ftp_file_path = './ftp_files/' + list(ftp_size)[1]
print(ftp_file_path)

# Open that file on the SFTP server for reading
ftp_file = sftp.file(ftp_file_path, 'r')

./ftp_files/date_1.csv


In [19]:
# Read the remote file through the open handle (no size limit is given)
ftp_file_data = ftp_file.read()

In [20]:
# Upload the file selected by ftp_file_path in a single request when it fits
# into one chunk.
# The name is derived from the path itself so that the size check, the data
# that is read and the S3 key all refer to the same file.
small_file = ftp_file_path.rsplit('/', 1)[-1]
if ftp_size[small_file] <= int(chunk_size):
    print('Transferring complete File from FTP to S3...')
    # The with-block closes the remote handle even when the upload fails
    with sftp.file(ftp_file_path, 'r') as ftp_file:
        client.upload_fileobj(Fileobj=ftp_file, Bucket=bucket_name, Key=small_file)
    print('Successfully Transferred file from FTP to S3!')

Transferring complete File from FTP to S3...
Successfully Transferred file from FTP to S3!


### Files to be uploaded in chunks

In [46]:
# Names of all files that are larger than one chunk
big_files = [name for name, size in ftp_size.items() if size > chunk_size]

# Big files are :-
print(big_files)
# Continue with the second large file
big_file = big_files[1]

['Measurement_info.csv', 'Measurement_summary.csv']


In [47]:
big_file_size = ftp_size[big_file]
print("\nsize of {} is {} bytes".format(big_file, big_file_size))

# Number of chunk_size pieces needed to cover the whole file (rounded up)
chunk_count = math.ceil(big_file_size / chunk_size)
print("\nThere are {} chunks in which {} is broken down.\n".format(chunk_count, big_file))

# Remote location of the large file, opened for reading
ftp_file_path = './ftp_files/' + big_file
print("\nThe Path from which file to be taken is :: {}\n".format(ftp_file_path))
ftp_file = sftp.file(ftp_file_path, 'r')


size of Measurement_summary.csv is 94076158 bytes

There are 15 chunks in which Measurement_summary.csv is broken down.



 #### Some Theory
* There are a few things we should know before uploading files in chunks
    - [create_multipart_upload()](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.create_multipart_upload) 
        - This method initiates the multipart upload operation and returns the Upload ID, which is used to associate all the parts uploaded in this multipart upload. This Upload ID must be specified in every part upload request.
        
             **NOTE** 
            - _After you initiate a multipart upload and upload one or more parts, to stop being charged for storing the uploaded parts, you must either complete or abort the multipart upload. Amazon S3 frees up the space used to store the parts and stop charging you for storing them only after you either complete or abort a multipart upload._
            
    - [abort_multipart_upload()](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.abort_multipart_upload)
        - This method is used to abort the multipart upload operation. After executing this method the storage consumed by any previously uploaded parts will be freed. 
        ```python
            response = client.abort_multipart_upload(
                        Bucket= bucket_name,
                        Key=big_file,
                        UploadId=multipart_upload_ID['UploadId'])
            print(response)
            
        ```
         **NOTE**
        - _To verify that all parts have been removed, so you don't get charged for the part storage, you should call the ListParts operation and ensure that the parts list is empty._
  
    - [complete_multipart_upload()](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.complete_multipart_upload)
        - This method is used to assemble the previously uploaded parts with the given Upload ID. Upon receiving this request, Amazon S3 concatenates all the parts in ascending order by part number, which is provided in the part list along with its entity tag (ETag).
        
        **NOTE**
        - Part List must be provided with the following structure
        ```python
           {
            'Parts': 
               [   {
                    'ETag': 'string',
                    'PartNumber': 123
               }   ]
           }
        ```
    - [upload_part()](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.upload_part)

        - This method is used to Upload a part one by one.
        - _If you upload a new part using the same part number that was used with a previous part, the previously uploaded part is overwritten._

In [26]:
# Start a multipart upload. The returned UploadId associates every uploaded
# part with this upload.
multipart_upload_ID = client.create_multipart_upload(Bucket = bucket_name, Key = big_file)
upload_id = multipart_upload_ID['UploadId']

# One {'PartNumber', 'ETag'} entry per uploaded part, collected for the
# completion request
parts = []

# ftp_file_path --> path of the file on the SFTP server (file name included)
ftp_file_path = './ftp_files/' + big_file
print(ftp_file_path)

# Open the file on the remote server; a file-like object is returned
ftp_file = sftp.file(ftp_file_path, 'r')

try:
    for i in range(chunk_count):
        part_number = i + 1
        print('\nTransferring chunk {}...'.format(part_number))

        if transport.is_active():
            print("Connection not Dropped\n")
        else:
            print("Connection Dropped\n")
            # The address must be ONE (host, port) tuple; a second positional
            # argument would be taken as default_window_size.
            transport = paramiko.Transport((host, port))
            transport.connect(username=user, password=pss)
            sftp = paramiko.SFTPClient.from_transport(transport)
            # The old handle belongs to the dropped session: reopen the file
            # on the new session and continue at the start of this chunk.
            ftp_file = sftp.file(ftp_file_path, 'r')
            ftp_file.seek(i * int(chunk_size))
            print("Connected again \n")

        print("Reading the chunk\n")
        # Read up to chunk_size bytes from the current position
        chunk = ftp_file.read(int(chunk_size))

        print("Uploading the chunk\n")
        part = client.upload_part(
            Bucket=bucket_name,
            Key=big_file,
            # PartNumber identifies the part and its position in the object
            PartNumber=part_number,
            UploadId=upload_id,
            Body=chunk,
        )

        # The ETag of every part is required again when completing the upload
        part_output = {'PartNumber': part_number, 'ETag': part['ETag']}
        pp.pprint(part_output)

        parts.append(part_output)
        print('Chunk {} Transferred Successfully!'.format(part_number))
except BaseException:
    # Any failure (including an interrupted cell) leaves already uploaded
    # parts behind, and they keep being billed until the upload is aborted
    # or completed. Abort it, then let the original error surface.
    client.abort_multipart_upload(Bucket=bucket_name, Key=big_file, UploadId=upload_id)
    raise


./ftp_files/Measurement_summary.csv
Transferring chunk 1...
Connection not Dropped

Reading the chunk

Uploading the chunk

{'ETag': '"c3e7bf42966af875af1ee617d77c5d3e"', 'PartNumber': 1}
Chunk 1 Transferred Successfully!
Transferring chunk 2...
Connection not Dropped

Reading the chunk

Uploading the chunk

{'ETag': '"cb52830a808fac50697eddf3cab88ca1"', 'PartNumber': 2}
Chunk 2 Transferred Successfully!
Transferring chunk 3...
Connection not Dropped

Reading the chunk

Uploading the chunk

{'ETag': '"60bfb964d720d6a92e3a125fbc1a765e"', 'PartNumber': 3}
Chunk 3 Transferred Successfully!
Transferring chunk 4...
Connection not Dropped

Reading the chunk

Uploading the chunk

{'ETag': '"c6d6223db9cd9850ea8a0447f3773e4d"', 'PartNumber': 4}
Chunk 4 Transferred Successfully!
Transferring chunk 5...
Connection not Dropped

Reading the chunk

Uploading the chunk

{'ETag': '"0ac135fc4d0cc8e6a3e28e8f3e936a6d"', 'PartNumber': 5}
Chunk 5 Transferred Successfully!
Transferring chunk 6...
Connection

In [27]:
# The completion request needs the uploaded parts wrapped under 'Parts'
part_list = {'Parts': parts}

# Ask S3 to assemble the previously uploaded parts into the final object
completion_request = {
    'Bucket': bucket_name,
    'Key': big_file,
    'UploadId': multipart_upload_ID['UploadId'],
    'MultipartUpload': part_list,
}
client.complete_multipart_upload(**completion_request)
print('\nAll chunks Transferred to S3 bucket! File Transfer successful!\n')

All chunks Transferred to S3 bucket! File Transfer successful!


In [51]:
# Must be done once the transfer is finished:
# close the file handle that was opened on the remote server
ftp_file.close()

In [50]:
# Ask the transport whether its session is still active after the upload
transport.is_active()

True

In [49]:
# Show the part list that was sent with the completion request.
# (The name part_info is never defined in this notebook; the structure
# holding the parts is part_list.)
pp.pprint(part_list)

{   'Parts': [   {   'ETag': '"c3e7bf42966af875af1ee617d77c5d3e"',
                     'PartNumber': 1},
                 {   'ETag': '"cb52830a808fac50697eddf3cab88ca1"',
                     'PartNumber': 2},
                 {   'ETag': '"60bfb964d720d6a92e3a125fbc1a765e"',
                     'PartNumber': 3},
                 {   'ETag': '"c6d6223db9cd9850ea8a0447f3773e4d"',
                     'PartNumber': 4},
                 {   'ETag': '"0ac135fc4d0cc8e6a3e28e8f3e936a6d"',
                     'PartNumber': 5},
                 {   'ETag': '"e425c7cd789621e31b28b265ae24b5e3"',
                     'PartNumber': 6},
                 {   'ETag': '"2b2571f9dd170dd2ec7cad27fb7e40a8"',
                     'PartNumber': 7},
                 {   'ETag': '"e2de8ec79030a9cd60321f72069d35f9"',
                     'PartNumber': 8},
                 {   'ETag': '"27813daa204e26d522861b5f67a870a8"',
                     'PartNumber': 9},
                 {   'ETag': '"e577cb9212c8ffc

In [None]:
# Close the SFTP client first
sftp.close()
# Then close the transport it was created from
transport.close()

In [9]:
# Two consecutive read(6) calls on the same handle: the second one
# continues where the first one stopped
with open("/home/bluepi/Documents/file.txt", "r") as f:
    first_piece = f.read(6)
    print(first_piece)
    second_piece = f.read(6)
    print(second_piece)

qqqqqq

wwwww
